From 4075fbaa3cc95379f29436dd0755c2ade14903fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 14 Jul 2025 10:08:58 +0000 Subject: [PATCH] survyed sign off --- ...ml => hubspot_surveyed_needs_sign_off.yml} | 6 +-- etl/daily_script.py | 2 +- etl/development.py | 2 +- etl/fileReader/reportType.py | 3 ++ etl/fileReader/sitenotes.py | 2 +- etl/fileReader/xmlReader.py | 43 +++++++++++++++++++ etl/hubSpotClient/types.py | 20 ++++++++- ....py => hubspot_surveyed_needs_sign_off.py} | 9 +++- etl/jjc_old_lewis_manual_way_.py | 2 +- etl/models/topLevel.py | 2 +- etl/scis_invoice.py | 2 +- etl/sgec_invoice.py | 2 +- etl/surveyedData/surveryedData.py | 20 +++++++-- etl/validator/validator.py | 4 +- 14 files changed, 99 insertions(+), 20 deletions(-) rename .github/workflows/{hubspot_deal_notes.yml => hubspot_surveyed_needs_sign_off.yml} (79%) create mode 100644 etl/fileReader/xmlReader.py rename etl/{hubspot_verification_to_db_load.py => hubspot_surveyed_needs_sign_off.py} (71%) diff --git a/.github/workflows/hubspot_deal_notes.yml b/.github/workflows/hubspot_surveyed_needs_sign_off.yml similarity index 79% rename from .github/workflows/hubspot_deal_notes.yml rename to .github/workflows/hubspot_surveyed_needs_sign_off.yml index ce80afe..2b036d6 100644 --- a/.github/workflows/hubspot_deal_notes.yml +++ b/.github/workflows/hubspot_surveyed_needs_sign_off.yml @@ -1,7 +1,7 @@ -name: Deal Notes From HubSpot Scraper +name: Daily Surved on: schedule: - - cron: '0 19 * * 0' + - cron: '0 17 * * 1-5' workflow_dispatch: jobs: @@ -24,6 +24,6 @@ jobs: run: | pwd ls -la - poetry run python etl/dimitra_hubspot_notes_gather.py + poetry run python etl/hubspot_surveyed_needs_sign_off.py env: PYTHONPATH: ${{ github.workspace }} \ No newline at end of file diff --git a/etl/daily_script.py b/etl/daily_script.py index b2c76d9..1689167 100644 --- a/etl/daily_script.py +++ b/etl/daily_script.py @@ -1,5 +1,5 @@ import os -from pdfReader.pdfReaderToText import pdfReaderToText +from fileReader.pdfReaderToText import pdfReaderToText from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING from pprint import pprint, pformat import logging diff --git a/etl/development.py b/etl/development.py index f70c68d..9319e1c 100644 --- a/etl/development.py +++ b/etl/development.py @@ -1,6 +1,6 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller from pprint import pformat -from etl.pdfReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.pdfReaderToText import pdfReaderToText from etl.surveyedData.surveryedData import surveyedDataProcessor import pandas as pd diff --git a/etl/fileReader/reportType.py b/etl/fileReader/reportType.py index 1db2efb..07ac12e 100644 --- a/etl/fileReader/reportType.py +++ b/etl/fileReader/reportType.py @@ -10,3 +10,6 @@ class ReportType(Enum): ECO_CONDITION_REPORT = "osmosis_condition_pas_2035_report" WARM_HOMES_CONDITION_REPORT = "warm_homes_condition_pas_2035_report" RDSAP_ENERGY_REPORT = "rdsap_energy_report" + LIG_XML = "lodgement_xml_needed_for_lodgement_to_like_trademark" + RDSAP_XML = "reduce_xml_needed_to_generate_full_sap_xml" + FULLSAP_XML = "full_xml_needed_for_co_ordination" diff --git a/etl/fileReader/sitenotes.py b/etl/fileReader/sitenotes.py index 6026d9b..f46726c 100644 --- a/etl/fileReader/sitenotes.py +++ b/etl/fileReader/sitenotes.py @@ -1,4 +1,4 @@ -from etl.pdfReader.reportType import ReportType +from etl.fileReader.reportType import ReportType from etl.transform.preSiteNoteTypes import ( CompanyInfo, PreSiteNotesSummaryInfo, AssessorInfo, PropertyDescription, PropertyDetail, Dimension, diff --git a/etl/fileReader/xmlReader.py b/etl/fileReader/xmlReader.py new file mode 100644 index 0000000..d49b9d4 --- /dev/null +++ b/etl/fileReader/xmlReader.py @@ -0,0 +1,43 @@ +from etl.utils.logger import Logger +import logging +from xml.dom.minidom import parse +import os +from etl.fileReader.reportType import ReportType + +class xmlReader(): + def __init__(self, file_path): + self.source_path = file_path + self.logger = Logger(name='xmlReader', level=logging.INFO).get_logger() + self.xml_obj = None + self.type = None + self.get_xml_obj() + + + def get_xml_obj(self): + try: + if not os.path.exists(self.source_path): + self.logger.error(f"File not found: {self.source_path}") + return None + + with open(self.source_path, 'r', encoding='utf-8') as file: + self.xml_obj = parse(file) + self.get_type() + return self.xml_obj + + except Exception as e: + self.logger.error(f"Failed to parse XML file {self.source_path}: {e}") + self.xml_obj = None + return self.xml_obj + + def get_type(self): + xmlHeaderName = self.xml_obj.documentElement.tagName + xmlHeaderName = xmlHeaderName.lower() + if xmlHeaderName == 'RdSap-Report'.lower(): + self.type = ReportType.LIG_XML + elif xmlHeaderName == "SurveyRec".lower(): + self.type = ReportType.RDSAP_XML + elif xmlHeaderName == "ImportExportRecord".lower(): + self.type = ReportType.FULLSAP_XML + else: + pass + return self.type diff --git a/etl/hubSpotClient/types.py b/etl/hubSpotClient/types.py index 84ef0d5..dd97011 100644 --- a/etl/hubSpotClient/types.py +++ b/etl/hubSpotClient/types.py @@ -71,11 +71,27 @@ class SubmissionInfoFromDeal(BaseModel): # download files in url and check files are there: try: + files = sp.download_files_from_path(path) print(files) sdp = surveyedDataProcessor("fake address", files) - assert sdp.condition_report is not None, "Condition Report is missing" - assert sdp.energy_report is not None, "Energy Report pdf is missing" + missing_items = [] + + if sdp.condition_report is None: + missing_items.append("Condition Report") + + if sdp.energy_report is None: + missing_items.append("Energy Report PDF") + + if sdp.rd_sap_xml is None: + missing_items.append("RDSAP XML") + + if sdp.lig_sap_xml is None: + missing_items.append("LIG SAP XML") + + if missing_items: + raise ValueError(f"Missing required items: {', '.join(missing_items)}") + except Exception as e: raise ValueError(str(e)) diff --git a/etl/hubspot_verification_to_db_load.py b/etl/hubspot_surveyed_needs_sign_off.py similarity index 71% rename from etl/hubspot_verification_to_db_load.py rename to etl/hubspot_surveyed_needs_sign_off.py index 9b0f9e1..a8eeabc 100644 --- a/etl/hubspot_verification_to_db_load.py +++ b/etl/hubspot_surveyed_needs_sign_off.py @@ -1,3 +1,7 @@ +""" +This is the script that runs when we are at the 'surveyed-needs sign off' stage within hubspot +""" + import os from pprint import pprint @@ -15,6 +19,7 @@ hubspotClient = HubSpotClient() deals = hubspotClient.get_deals_from_deal_stage(DealStage.SURVEYED_COMPLETE_NEEDS_SIGN_OFF) +for deal in deals: + hubspotClient.move_deals_to_different_stage([deal.deal_id], DealStage.SURVEYED_COMPLETED_SIGNED_OFF.value) -# TODO sanity address check -# TODO load \ No newline at end of file +# TODO load when we are at 'ready to co-ordination' - script! \ No newline at end of file diff --git a/etl/jjc_old_lewis_manual_way_.py b/etl/jjc_old_lewis_manual_way_.py index ff12e94..027b515 100644 --- a/etl/jjc_old_lewis_manual_way_.py +++ b/etl/jjc_old_lewis_manual_way_.py @@ -5,7 +5,7 @@ os.environ["SHAREPOINT_TENANT_ID"] = "c3f7519c-2719-4547-af04-6da6cbfd8f8f" os.environ["JJC_SERVICE_SHAREPOINT_ID"] = "7fdd0485-bbf3-4b29-b30f-98c81c2a6284" from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING from pprint import pformat -from etl.pdfReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.pdfReaderToText import pdfReaderToText from etl.surveyedData.surveryedData import surveyedDataProcessor import pandas as pd import math diff --git a/etl/models/topLevel.py b/etl/models/topLevel.py index a2e2326..784d3e5 100644 --- a/etl/models/topLevel.py +++ b/etl/models/topLevel.py @@ -5,7 +5,7 @@ from datetime import datetime from pydantic import EmailStr from sqlalchemy import Column from sqlalchemy.dialects.postgresql import UUID -from etl.pdfReader.reportType import ReportType +from etl.fileReader.reportType import ReportType class BaseModel(SQLModel): id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) diff --git a/etl/scis_invoice.py b/etl/scis_invoice.py index fc9d481..9a479c6 100644 --- a/etl/scis_invoice.py +++ b/etl/scis_invoice.py @@ -1,6 +1,6 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING from pprint import pformat -from etl.pdfReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.pdfReaderToText import pdfReaderToText from etl.surveyedData.surveryedData import surveyedDataProcessor import pandas as pd diff --git a/etl/sgec_invoice.py b/etl/sgec_invoice.py index fc9d481..9a479c6 100644 --- a/etl/sgec_invoice.py +++ b/etl/sgec_invoice.py @@ -1,6 +1,6 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING from pprint import pformat -from etl.pdfReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.pdfReaderToText import pdfReaderToText from etl.surveyedData.surveryedData import surveyedDataProcessor import pandas as pd diff --git a/etl/surveyedData/surveryedData.py b/etl/surveyedData/surveryedData.py index e4299ae..485d617 100644 --- a/etl/surveyedData/surveryedData.py +++ b/etl/surveyedData/surveryedData.py @@ -1,5 +1,6 @@ -from etl.pdfReader.pdfReaderToText import pdfReaderToText -from etl.pdfReader.reportType import ReportType +from etl.fileReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.xmlReader import xmlReader +from etl.fileReader.reportType import ReportType import math from xml.dom.minidom import parseString from etl.models.preSiteNoteTypes import ( @@ -41,6 +42,10 @@ class surveyedDataProcessor(): self.condition_report = None self.hubspot_deal_id = None self.energy_report = None + self.full_sap_xml = None + self.lig_sap_xml = None + self.rd_sap_xml = None + self.identify_files() @@ -62,8 +67,15 @@ class surveyedDataProcessor(): elif pdf.type == ReportType.RDSAP_ENERGY_REPORT: self.energy_report = pdf.get_reader() elif file.lower().endswith('.xml'): - print(f"identified an xml file {file.lower()}") - pass + xml = xmlReader(file) + if xml: + if xml.type is ReportType.FULLSAP_XML: + self.full_sap_xml = xml.xml_obj + + elif xml.type is ReportType.LIG_XML: + self.lig_sap_xml = xml.xml_obj + elif xml.type is ReportType.RDSAP_XML: + self.rd_sap_xml = xml.xml_obj def load_condition_report(self, db_session): general_information = self.load_general_information_from_condition_report(db_session) diff --git a/etl/validator/validator.py b/etl/validator/validator.py index 4a5ce3a..2dd88a5 100644 --- a/etl/validator/validator.py +++ b/etl/validator/validator.py @@ -2,8 +2,8 @@ import os import logging from etl.utils.logger import Logger import re -from etl.pdfReader.pdfReaderToText import pdfReaderToText -from etl.pdfReader.reportType import ReportType +from etl.fileReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.reportType import ReportType class DomnaSharePointValidator():