From 252657a37423543f6d630f0a08206a729c64eaca Mon Sep 17 00:00:00 2001
From: Daniel Roth
Date: Wed, 29 Apr 2026 09:55:14 +0000
Subject: [PATCH] include updating epc_property_data to pashub to ara workflow

---
 backend/documents_parser/db_writer.py     | 88 +++++++++++++++++++++++
 backend/documents_parser/parser.py        | 14 ++++
 backend/pashub_fetcher/handler/handler.py | 50 +++++++--------
 3 files changed, 127 insertions(+), 25 deletions(-)
 create mode 100644 backend/documents_parser/db_writer.py
 create mode 100644 backend/documents_parser/parser.py

diff --git a/backend/documents_parser/db_writer.py b/backend/documents_parser/db_writer.py
new file mode 100644
index 00000000..2039aabe
--- /dev/null
+++ b/backend/documents_parser/db_writer.py
@@ -0,0 +1,88 @@
+from typing import Optional
+
+from sqlmodel import Session
+
+from backend.app.db.models.epc_property import (
+    EpcBuildingPartModel,
+    EpcEnergyElementModel,
+    EpcFlatDetailsModel,
+    EpcFloorDimensionModel,
+    EpcMainHeatingDetailModel,
+    EpcPropertyEnergyPerformanceModel,
+    EpcPropertyModel,
+    EpcWindowModel,
+)
+from datatypes.epc.domain.epc_property_data import EpcPropertyData
+
+
+def save_epc_property_data(
+    session: Session,
+    data: EpcPropertyData,
+    uploaded_file_id: Optional[int] = None,
+    property_id: Optional[int] = None,
+    portfolio_id: Optional[int] = None,
+) -> EpcPropertyModel:
+    """Add an EPC property and all of its child rows to ``session``.
+
+    The parent row and every building part are flushed so that their
+    generated primary keys can be passed to the child rows. Nothing is
+    committed here; the caller owns the transaction.
+
+    Raises:
+        RuntimeError: If a flush leaves a primary key unset.
+    """
+    epc_prop = EpcPropertyModel.from_epc_property_data(
+        data, property_id=property_id, portfolio_id=portfolio_id
+    )
+    epc_prop.uploaded_file_id = uploaded_file_id
+    session.add(epc_prop)
+    session.flush()
+    # Explicit check rather than ``assert``: asserts are stripped under -O.
+    if epc_prop.id is None:
+        raise RuntimeError("EpcPropertyModel has no id after flush")
+    epc_property_id: int = epc_prop.id
+
+    session.add(
+        EpcPropertyEnergyPerformanceModel.from_epc_property_data(
+            data, epc_property_id=epc_property_id
+        )
+    )
+
+    for detail in data.sap_heating.main_heating_details:
+        session.add(EpcMainHeatingDetailModel.from_domain(detail, epc_property_id))
+
+    for part in data.sap_building_parts:
+        bp = EpcBuildingPartModel.from_domain(part, epc_property_id)
+        session.add(bp)
+        session.flush()
+        if bp.id is None:
+            raise RuntimeError("EpcBuildingPartModel has no id after flush")
+        for dim in part.sap_floor_dimensions:
+            session.add(EpcFloorDimensionModel.from_domain(dim, bp.id))
+
+    for window in data.sap_windows:
+        session.add(EpcWindowModel.from_domain(window, epc_property_id))
+
+    for el in data.roofs:
+        session.add(EpcEnergyElementModel.from_domain(el, "roof", epc_property_id))
+    for el in data.walls:
+        session.add(EpcEnergyElementModel.from_domain(el, "wall", epc_property_id))
+    for el in data.floors:
+        session.add(EpcEnergyElementModel.from_domain(el, "floor", epc_property_id))
+    for el in data.main_heating:
+        session.add(EpcEnergyElementModel.from_domain(el, "main_heating", epc_property_id))
+
+    for el, etype in [
+        (data.window, "window"),
+        (data.lighting, "lighting"),
+        (data.hot_water, "hot_water"),
+        (data.secondary_heating, "secondary_heating"),
+        (data.main_heating_controls, "main_heating_controls"),
+    ]:
+        if el is not None:
+            session.add(EpcEnergyElementModel.from_domain(el, etype, epc_property_id))
+
+    if data.sap_flat_details is not None:
+        session.add(EpcFlatDetailsModel.from_domain(data.sap_flat_details, epc_property_id))
+
+    return epc_prop
diff --git a/backend/documents_parser/parser.py b/backend/documents_parser/parser.py
new file mode 100644
index 00000000..0f6760d7
--- /dev/null
+++ b/backend/documents_parser/parser.py
@@ -0,0 +1,14 @@
+from datatypes.epc.domain.epc_property_data import EpcPropertyData
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+
+from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
+from backend.documents_parser.pdf import pdf_to_text_list
+
+
+def parse_pashub_site_notes(file_path: str) -> EpcPropertyData:
+    """Read the PDF at ``file_path`` and map its site notes to ``EpcPropertyData``."""
+    with open(file_path, "rb") as f:
+        pdf_bytes = f.read()
+    tokens = pdf_to_text_list(pdf_bytes)
+    site_notes = PasHubRdSapSiteNotesExtractor(tokens).extract()
+    return EpcPropertyDataMapper.from_site_notes(site_notes)
diff --git a/backend/pashub_fetcher/handler/handler.py 
b/backend/pashub_fetcher/handler/handler.py index fc1f4f80..b9df216e 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone import os import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple, cast from openpyxl import load_workbook from backend.app.config import get_settings @@ -11,7 +11,10 @@ from backend.app.db.models.uploaded_file import ( FileTypeEnum, UploadedFile, ) +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_pashub_site_notes from backend.pashub_fetcher.core_files import infer_file_type +from datatypes.epc.domain.epc_property_data import EpcPropertyData from backend.pashub_fetcher.job import Job from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError @@ -119,6 +122,7 @@ def upload_job_to_s3_and_update_db( ) uploaded_files: List[UploadedFile] = [] + site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] for file_path in job_files: filename = os.path.basename(file_path) @@ -126,40 +130,36 @@ def upload_job_to_s3_and_update_db( upload_file_to_s3(file_path, bucket, file_key) - # load row to db # TODO: use same upload_file_to_s3_and_update_db method as ecmk fetcher does - uploaded_files.append( - UploadedFile( - s3_file_bucket=bucket, - s3_file_key=file_key, - s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn) if uprn else None, - hubspot_deal_id=hubspot_deal_id, - file_source=FileSourceEnum.PAS_HUB.value, - file_type=infer_file_type(filename), - ) + uploaded_file = UploadedFile( + s3_file_bucket=bucket, + s3_file_key=file_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn else None, + hubspot_deal_id=hubspot_deal_id, + file_source=FileSourceEnum.PAS_HUB.value, + file_type=infer_file_type(filename), ) + uploaded_files.append(uploaded_file) + + file_type: 
Optional[str] = cast(Optional[str], uploaded_file.file_type) + if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: + try: + site_notes_pairs.append((uploaded_file, parse_pashub_site_notes(file_path))) + except Exception: + logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) with db_session() as session: session.add_all(uploaded_files) - - # Ensure IDs are generated session.flush() - results = [ - {"file": file.s3_file_key, "type": file.file_type, "id": file.id} - for file in uploaded_files - ] + for uploaded_file, epc_data in site_notes_pairs: + save_epc_property_data( + session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) + ) session.commit() - for result in results: - if FileTypeEnum(result["type"]) == FileTypeEnum.RD_SAP_SITE_NOTE: - # upload site notes to epc_property table - continue - - pass - def process_job( job: PashubToAraTriggerRequest,