From 0811339557a0ee572e55e520477eb7901fb97dbe Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Tue, 15 Jul 2025 15:15:42 +0000
Subject: [PATCH] summary report and epr with data is now identified

---
Review notes (text between the "---" line and the first "diff --git" line is
not part of the commit message):
 - etl/hubSpotClient/types.py: the "EPR Energy report with data" check now
   tests sdp.epr_with_data. It previously tested sdp.epr_summary_information,
   which is checked again further down, so a missing with-data report was
   never reported and a missing summary report was reported twice.
 - NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch. Hunk line counts match the "@@"
   headers and the diffstat, but indentation widths, blank-line placement and
   whitespace-only lines are assumptions - confirm against the target files
   before applying.
 - NOTE(review): the post-image blob id on the types.py "index" line predates
   the fix above.

 etl/epr_etl_example.py            |  1 +
 etl/fileReader/pdfReaderToText.py | 23 +++++++++++++++++------
 etl/fileReader/reportType.py      |  4 ++--
 etl/fileReader/sitenotes.py       | 25 +++++++++++++++++--------
 etl/hubSpotClient/types.py        |  7 +++++--
 etl/surveyedData/surveryedData.py | 10 +++++++---
 6 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/etl/epr_etl_example.py b/etl/epr_etl_example.py
index bd7601e..866c75a 100644
--- a/etl/epr_etl_example.py
+++ b/etl/epr_etl_example.py
@@ -2,6 +2,7 @@ from etl.surveyedData.surveryedData import surveyedDataProcessor
 
 files = [
     "/tmp/sharepoint/Sandwell/SANDWELL-001/26 Willow close B64 6EG/Content (13).pdf",
+    "/tmp/sharepoint/Livewest/Livewest-001/12 Birch End/Summary Information 12 Birch End.pdf"
 ]
 
 from sqlalchemy.dialects.postgresql import UUID
diff --git a/etl/fileReader/pdfReaderToText.py b/etl/fileReader/pdfReaderToText.py
index 531c316..9668eed 100644
--- a/etl/fileReader/pdfReaderToText.py
+++ b/etl/fileReader/pdfReaderToText.py
@@ -1,7 +1,15 @@
 from etl.utils.logger import Logger
 import logging
 import pymupdf
-from etl.fileReader.sitenotes import QuidosSiteNotesExtractor, CSR, WarmHomesConditionReport, ECOConditionReport, RDSAPEnergyReport
+from etl.fileReader.sitenotes import (
+    QuidosSiteNotesExtractor,
+    CSR,
+    WarmHomesConditionReport,
+    ECOConditionReport,
+    EnergyPerformanceReportWithData,
+    EnergyPerformanceReportSummaryInformation
+
+)
 from etl.fileReader.reportType import ReportType
 from pprint import pprint
 
@@ -46,8 +54,10 @@ class pdfReaderToText():
             self.type = ReportType.WARM_HOMES_CONDITION_REPORT
         elif "Domna NEW PAS 2035 ECO Condition Report".lower() in self.text_list[0].lower():
             self.type = ReportType.ECO_CONDITION_REPORT
-        elif "ENERGY REPORT".lower() == self.text_list[0].lower():
-            self.type = ReportType.RDSAP_ENERGY_REPORT
+        elif "ENERGY REPORT".lower() == self.text_list[0].lower() and "Data inputs" in self.text_list:
+            self.type = ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA
+        elif "Summary Information".lower() == self.text_list[0].lower():
+            self.type = ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION
         else:
             pass
         return self.type
@@ -63,6 +73,7 @@ class pdfReaderToText():
             return WarmHomesConditionReport(self.text_list)
         elif self.type == ReportType.ECO_CONDITION_REPORT:
             return ECOConditionReport(self.text_list)
-        elif self.type == ReportType.RDSAP_ENERGY_REPORT:
-            return RDSAPEnergyReport(self.text_list)
-        
\ No newline at end of file
+        elif self.type == ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA:
+            return EnergyPerformanceReportWithData(self.text_list)
+        elif self.type == ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION:
+            return EnergyPerformanceReportSummaryInformation(self.text_list)
\ No newline at end of file
diff --git a/etl/fileReader/reportType.py b/etl/fileReader/reportType.py
index 07ac12e..9fce283 100644
--- a/etl/fileReader/reportType.py
+++ b/etl/fileReader/reportType.py
@@ -4,12 +4,12 @@ from enum import Enum
 class ReportType(Enum):
     QUIDOS_PRESITE_NOTE = "quidos_presite_note"
     CHARTED_SURVEYOR_REPORT = "charted_surveyor_report"
-    ENERGY_PERFORMANCE_REPORT = "energy_performance_report"
     U_VALUE_CALCULATOR_REPORT = "u_value_calculator_report"
     OVERWRITING_U_VALUE_DECLARATION_FORM = "overwriting_u_value_declaration_form"
     ECO_CONDITION_REPORT = "osmosis_condition_pas_2035_report"
     WARM_HOMES_CONDITION_REPORT = "warm_homes_condition_pas_2035_report"
-    RDSAP_ENERGY_REPORT = "rdsap_energy_report"
+    ENERGY_PERFORMANCE_REPORT_WITH_DATA = "energy_performance_report_with_data"
+    ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION = "energy_performance_report_summary_information"
     LIG_XML = "lodgement_xml_needed_for_lodgement_to_like_trademark"
     RDSAP_XML = "reduce_xml_needed_to_generate_full_sap_xml"
     FULLSAP_XML = "full_xml_needed_for_co_ordination"
diff --git a/etl/fileReader/sitenotes.py b/etl/fileReader/sitenotes.py
index f46726c..1df8747 100644
--- a/etl/fileReader/sitenotes.py
+++ b/etl/fileReader/sitenotes.py
@@ -88,14 +88,7 @@ class CSR(SiteNotesExtractor):
             type=dict_.get('detailed_description_of_existing_cavity_wall_insulation_', "")
         ) if dict_ is not None else None
 
-class RDSAPEnergyReport(SiteNotesExtractor):
-    def __init__(self, data_list):
-        super().__init__(data_list)
-        self.type = ReportType.RDSAP_ENERGY_REPORT
-        self.master_obj = self.setup_energy_report()
 
-    def setup_energy_report(self):
-        pass
 
 class ECOConditionReport(SiteNotesExtractor):
     def __init__(self, data_list):
@@ -1597,4 +1590,20 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
             main_gas_avalible=True if dict_.get("main_gas_available", "NO").upper() == "YES" else False,
         )
 
-    
\ No newline at end of file
+class EnergyPerformanceReportWithData(SiteNotesExtractor):
+    def __init__(self, data_list):
+        super().__init__(data_list)
+        self.type = ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA
+        self.master_obj = self.setup()
+
+    def setup(self):
+        pass
+
+class EnergyPerformanceReportSummaryInformation(SiteNotesExtractor):
+    def __init__(self, data_list):
+        super().__init__(data_list)
+        self.type = ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION
+        self.master_obj = self.setup()
+
+    def setup(self):
+        pass
\ No newline at end of file
diff --git a/etl/hubSpotClient/types.py b/etl/hubSpotClient/types.py
index dd97011..e004243 100644
--- a/etl/hubSpotClient/types.py
+++ b/etl/hubSpotClient/types.py
@@ -80,8 +80,8 @@ class SubmissionInfoFromDeal(BaseModel):
         if sdp.condition_report is None:
             missing_items.append("Condition Report")
 
-        if sdp.energy_report is None:
-            missing_items.append("Energy Report PDF")
+        if sdp.epr_with_data is None:
+            missing_items.append("EPR Energy report with data is missing")
 
         if sdp.rd_sap_xml is None:
             missing_items.append("RDSAP XML")
@@ -89,6 +89,9 @@ class SubmissionInfoFromDeal(BaseModel):
         if sdp.lig_sap_xml is None:
             missing_items.append("LIG SAP XML")
 
+        if sdp.epr_summary_information is None:
+            missing_items.append("EPR Summary information is missing")
+
         if missing_items:
             raise ValueError(f"Missing required items: {', '.join(missing_items)}")
 
diff --git a/etl/surveyedData/surveryedData.py b/etl/surveyedData/surveryedData.py
index 485d617..f75b112 100644
--- a/etl/surveyedData/surveryedData.py
+++ b/etl/surveyedData/surveryedData.py
@@ -41,7 +41,8 @@ class surveyedDataProcessor():
         self.csr = None
         self.condition_report = None
         self.hubspot_deal_id = None
-        self.energy_report = None
+        self.epr_with_data = None
+        self.epr_summary_information = None
         self.full_sap_xml = None
         self.lig_sap_xml = None
         self.rd_sap_xml = None
@@ -64,8 +65,11 @@ class surveyedDataProcessor():
                         self.condition_report = pdf.get_reader()
                     elif pdf.type == ReportType.ECO_CONDITION_REPORT:
                         self.condition_report = pdf.get_reader()
-                    elif pdf.type == ReportType.RDSAP_ENERGY_REPORT:
-                        self.energy_report = pdf.get_reader()
+                    elif pdf.type == ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA:
+                        self.epr_with_data = pdf.get_reader()
+                    elif pdf.type == ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION:
+                        self.epr_summary_information = pdf.get_reader()
+
             elif file.lower().endswith('.xml'):
                 xml = xmlReader(file)
                 if xml: