diff --git a/etl/pdfReader/__init__.py b/etl/fileReader/__init__.py
similarity index 100%
rename from etl/pdfReader/__init__.py
rename to etl/fileReader/__init__.py
diff --git a/etl/pdfReader/pdfReaderToText.py b/etl/fileReader/pdfReaderToText.py
similarity index 69%
rename from etl/pdfReader/pdfReaderToText.py
rename to etl/fileReader/pdfReaderToText.py
index 34e37ad..bc9643f 100644
--- a/etl/pdfReader/pdfReaderToText.py
+++ b/etl/fileReader/pdfReaderToText.py
@@ -1,8 +1,8 @@
from etl.utils.logger import Logger
import logging
import pymupdf
-from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR, ConditionReport
-from etl.pdfReader.reportType import ReportType
+from etl.fileReader.sitenotes import QuidosSiteNotesExtractor, CSR, WarmHomesConditionReport, ECOConditionReport, RDSAPEnergyReport
+from etl.fileReader.reportType import ReportType
class pdfReaderToText():
@@ -24,6 +24,7 @@ class pdfReaderToText():
self.all_text += text
self.text_list = self.all_text.split('\n')
+        logging.getLogger(__name__).debug("Extracted %d lines of text from the PDF", len(self.text_list))
def get_list_of_text(self):
return self.text_list
@@ -41,7 +42,11 @@ class pdfReaderToText():
elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in self.text_list:
self.type = ReportType.CHARTED_SURVEYOR_REPORT
elif "Osmosis ACD NEW PAS 2035 Condition Report".lower() in self.text_list[0].lower():
- self.type = ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT
+ self.type = ReportType.WARM_HOMES_CONDITION_REPORT
+ elif "Domna NEW PAS 2035 ECO Condition Report".lower() in self.text_list[0].lower():
+ self.type = ReportType.ECO_CONDITION_REPORT
+ elif "ENERGY REPORT".lower() == self.text_list[0].lower():
+ self.type = ReportType.RDSAP_ENERGY_REPORT
else:
pass
return self.type
@@ -53,6 +58,10 @@ class pdfReaderToText():
return QuidosSiteNotesExtractor(self.text_list)
elif self.type == ReportType.CHARTED_SURVEYOR_REPORT:
return CSR(self.text_list)
- elif self.type == ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT:
- return ConditionReport(self.text_list)
+ elif self.type == ReportType.WARM_HOMES_CONDITION_REPORT:
+ return WarmHomesConditionReport(self.text_list)
+ elif self.type == ReportType.ECO_CONDITION_REPORT:
+ return ECOConditionReport(self.text_list)
+ elif self.type == ReportType.RDSAP_ENERGY_REPORT:
+ return RDSAPEnergyReport(self.text_list)
\ No newline at end of file
diff --git a/etl/pdfReader/reportType.py b/etl/fileReader/reportType.py
similarity index 65%
rename from etl/pdfReader/reportType.py
rename to etl/fileReader/reportType.py
index a94847a..1db2efb 100644
--- a/etl/pdfReader/reportType.py
+++ b/etl/fileReader/reportType.py
@@ -7,4 +7,6 @@ class ReportType(Enum):
ENERGY_PERFORMANCE_REPORT = "energy_performance_report"
U_VALUE_CALCULATOR_REPORT = "u_value_calculator_report"
OVERWRITING_U_VALUE_DECLARATION_FORM = "overwriting_u_value_declaration_form"
- OSMOSIS_CONDITION_PAS_2035_REPORT = "osmosis_condition_pas_2035_report"
+ ECO_CONDITION_REPORT = "osmosis_condition_pas_2035_report"  # NOTE(review): value keeps the old OSMOSIS name while the Osmosis-titled PDF now maps to WARM_HOMES_CONDITION_REPORT - confirm this is intentional
+ WARM_HOMES_CONDITION_REPORT = "warm_homes_condition_pas_2035_report"
+ RDSAP_ENERGY_REPORT = "rdsap_energy_report"
diff --git a/etl/pdfReader/sitenotes.py b/etl/fileReader/sitenotes.py
similarity index 99%
rename from etl/pdfReader/sitenotes.py
rename to etl/fileReader/sitenotes.py
index 5bb3932..6026d9b 100644
--- a/etl/pdfReader/sitenotes.py
+++ b/etl/fileReader/sitenotes.py
@@ -88,11 +88,30 @@ class CSR(SiteNotesExtractor):
type=dict_.get('detailed_description_of_existing_cavity_wall_insulation_', "")
) if dict_ is not None else None
-
-class ConditionReport(SiteNotesExtractor):
+class RDSAPEnergyReport(SiteNotesExtractor):  # reader returned for PDFs detected as ReportType.RDSAP_ENERGY_REPORT
def __init__(self, data_list):
super().__init__(data_list)
- self.type = ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT
+ self.type = ReportType.RDSAP_ENERGY_REPORT
+ self.master_obj = self.setup_energy_report()
+
+ def setup_energy_report(self):
+ pass  # TODO: stub - returns None, so master_obj stays None until extraction is implemented
+
+class ECOConditionReport(SiteNotesExtractor):  # reader returned for PDFs detected as ReportType.ECO_CONDITION_REPORT
+ def __init__(self, data_list):
+ super().__init__(data_list)
+ self.type = ReportType.ECO_CONDITION_REPORT
+ self.master_obj = self.setup_condition_report()
+
+ def setup_condition_report(self):
+ pass  # TODO: stub - returns None, so master_obj stays None until extraction is implemented
+
+
+
+class WarmHomesConditionReport(SiteNotesExtractor):
+ def __init__(self, data_list):
+ super().__init__(data_list)
+ self.type = ReportType.WARM_HOMES_CONDITION_REPORT
self.master_obj = self.setup_condition_report()
def setup_condition_report(self):
diff --git a/etl/hubSpotClient/hubspot.py b/etl/hubSpotClient/hubspot.py
index c2d6af6..3b4913e 100644
--- a/etl/hubSpotClient/hubspot.py
+++ b/etl/hubSpotClient/hubspot.py
@@ -81,7 +81,7 @@ class HubSpotClient():
def get_domna_and_landlord_id(self, deals_id):
data = self.get_listings_from_deals_id(deals_id)
- return data.properties['domna_property_id'], data.properties['owner_property_id'], data.properties['national_uprn']
+ return data.properties['domna_property_id'], data.properties['owner_property_id'], data.properties.get('national_uprn', '') or ''  # uprn falls back to '' when the key is missing or its value is falsy (e.g. None)
def get_notes_from_deals_id(self, deals_id):
from hubspot.crm.objects import PublicObjectSearchRequest
@@ -211,7 +211,7 @@ class HubSpotClient():
try:
deal_name = deal.properties['dealname']
self.logger.info(f"Validating <{deal_name}>")
- input
+ # input(f"Press enter to verify <{deal_name}>")
all_deals.append(SubmissionInfoFromDeal(
deal_id= deal.properties["hs_object_id"],
deal_name=deal.properties["dealname"],
@@ -228,7 +228,7 @@ class HubSpotClient():
))
except Exception as e:
def format_error_note(e):
- note_text = "⚠️ Error occurred while verifying deal data:
"
+ note_text = "⚠️ Automated Verification Failed:
"
if hasattr(e, "errors") and callable(e.errors):
note_text += "❌ Validation Errors:
"
@@ -267,7 +267,6 @@ class HubSpotClient():
self.logger.info(f"Deal name <{deal_name}> moving to 'needs additional information'")
self.move_deals_to_different_stage([deal_id], DealStage.NEEDS_ADDITIONAL_INFORMATION_FROM_ASSESSOR.value)
-
return all_deals
def print_all_pipeline_ids(self):
diff --git a/etl/hubSpotClient/types.py b/etl/hubSpotClient/types.py
index e77acde..84ef0d5 100644
--- a/etl/hubSpotClient/types.py
+++ b/etl/hubSpotClient/types.py
@@ -5,6 +5,7 @@ import uuid
from pydantic import Field, field_validator, model_validator
from etl.utils.utils import get_sharepoint_path
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
+from etl.surveyedData.surveryedData import surveyedDataProcessor
@@ -16,7 +17,7 @@ def string_to_installer(installer):
elif installer.upper() == "SCIS":
return SharePointInstaller.SOUTH_COAST_INSULATION
elif installer.upper() == "SGEC":
- return SharePointInstaller.SGEC
+ return SharePointInstaller.JJC  # NOTE(review): "SGEC" now resolves to the JJC installer - confirm this mapping is intended
else:
return None
@@ -40,7 +41,7 @@ class SubmissionInfoFromDeal(BaseModel):
submission_folder_path: str = Field(..., min_length=1)
landlord_id: str = Field(..., min_length=1)
domna_id: str = Field(..., min_length=1)
- uprn: str = Field(..., min_length=1)
+ uprn: str
@field_validator('post_sap_score', 'no_of_wet_rooms')
@classmethod
@@ -50,18 +51,37 @@ class SubmissionInfoFromDeal(BaseModel):
return v
@model_validator(mode="after")
- def check_submission_folder_path(self):
- errors = []
-
+    def check_sharepoint_link_and_contents(self):
+        """Check the submission folder is reachable, non-empty and holds the required reports.
+
+        Raises:
+            ValueError: on a bad SharePoint path, an empty folder, or a missing report.
+        """
try:
path = get_sharepoint_path(self.submission_folder_path)
installer = string_to_installer(self.installer)
sp = SharePointScraper(installer)
- files = sp.get_folders_in_path(path)
-
- if "value" in files and len(files["value"]) > 0:
- return self
- raise ValueError(f"SharePoint folder is empty: {self.submission_folder_path}")
-
except Exception as e:
- raise ValueError(f"Error accessing SharePoint path: {self.submission_folder_path}. Error: {str(e)}")
\ No newline at end of file
+            raise ValueError(f"Error accessing SharePoint path: {self.submission_folder_path}. Error: {str(e)}") from e
+
+        # Check that the SharePoint link is reachable and the folder has contents.
+        try:
+            listing = sp.get_folders_in_path(path)
+            has_contents = "value" in listing and len(listing["value"]) > 0
+        except Exception as e:
+            raise ValueError(str(e)) from e
+        if not has_contents:
+            raise ValueError(f"SharePoint folder is empty: {self.submission_folder_path}")
+
+        # Download the folder's files and check the required reports were identified.
+        try:
+            files = sp.download_files_from_path(path)
+            sdp = surveyedDataProcessor("fake address", files)  # NOTE(review): placeholder address - confirm
+        except Exception as e:
+            raise ValueError(str(e)) from e
+        # Explicit raises instead of assert: assert statements are removed under `python -O`.
+        if sdp.condition_report is None:
+            raise ValueError("Condition Report is missing")
+        if sdp.energy_report is None:
+            raise ValueError("Energy Report pdf is missing")
+        return self
diff --git a/etl/hubspot_verification_to_db_load.py b/etl/hubspot_verification_to_db_load.py
index 5553196..9b0f9e1 100644
--- a/etl/hubspot_verification_to_db_load.py
+++ b/etl/hubspot_verification_to_db_load.py
@@ -13,3 +13,8 @@ os.environ["DATABASE_URL"] = "postgresql://postgres:makingwarmhomes@db:5432/post
hubspotClient = HubSpotClient()
deals = hubspotClient.get_deals_from_deal_stage(DealStage.SURVEYED_COMPLETE_NEEDS_SIGN_OFF)
+
+
+
+# TODO sanity address check
+# TODO load
\ No newline at end of file
diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py
index 550e05d..fb289cd 100644
--- a/etl/scraper/scraper.py
+++ b/etl/scraper/scraper.py
@@ -301,13 +301,46 @@ class SharePointScraper():
file_names_to_download.update({file["name"]: file['@microsoft.graph.downloadUrl']})
each_file = []
for file_name, url in file_names_to_download.items():
- self.logger.info(f"Downloading {file_name} from {url}")
+ self.logger.debug(f"Downloading {file_name} from {url}")
content = self.get_file_content(url)
file_path = self.create_temp_file(content, f"{name}/{WEEK_COMMENCING}/{house_ass}/{address}/{file_name}")
each_file.append(file_path)
address_paths.update({address: each_file})
paths.append(address_paths)
return paths
+
+    def download_files_from_path(self, path):
+        """
+        Download every non-media file listed directly under one folder path.
+
+        Args:
+            path: Folder path; passed to get_folders_in_path() and reused as the temp-file sub-path.
+
+        Returns:
+            List[str]: Paths returned by create_temp_file(), one per downloaded file.
+
+        Raises:
+            RuntimeError: If the folder listing has no 'value' key.
+        """
+        avoid = (".jpg", ".jpeg", ".png", ".heic", ".mov", ".mp4")  # matched against the lower-cased name
+        files_info = self.get_folders_in_path(path)
+        if 'value' not in files_info:
+            raise RuntimeError(f"Failed to get files from {path}")
+
+        file_names_to_download = {
+            file["name"]: file["@microsoft.graph.downloadUrl"]
+            for file in files_info['value']
+            if 'file' in file and not file["name"].lower().endswith(avoid)
+        }
+
+        downloaded_files = []
+        for file_name, url in file_names_to_download.items():
+            # Debug level, like the sibling download loop: the download URL should not sit in INFO logs.
+            self.logger.debug(f"Downloading {file_name} from {url}")
+            content = self.get_file_content(url)
+            downloaded_files.append(self.create_temp_file(content, f"{path}/{file_name}"))
+        return downloaded_files
+
def create_temp_file(self, content, path):
# Ensure the path is under /tmp/
@@ -320,6 +353,6 @@ class SharePointScraper():
with open(path, 'wb+') as temp_file:
temp_file.write(content.getvalue())
- self.logger.info(f"Temporary file created at: {path}")
+ self.logger.debug(f"Temporary file created at: {path}")
return path
\ No newline at end of file
diff --git a/etl/surveyedData/surveryedData.py b/etl/surveyedData/surveryedData.py
index 6e7a404..e4299ae 100644
--- a/etl/surveyedData/surveryedData.py
+++ b/etl/surveyedData/surveryedData.py
@@ -1,6 +1,7 @@
-from etl.pdfReader.pdfReaderToText import pdfReaderToText
-from etl.pdfReader.reportType import ReportType
+from etl.fileReader.pdfReaderToText import pdfReaderToText
+from etl.fileReader.reportType import ReportType
import math
+from xml.dom.minidom import parseString
from etl.models.preSiteNoteTypes import (
AssessorInfo, CompanyInfo,
PreSiteNotesSummaryInfo,
@@ -38,21 +39,31 @@ class surveyedDataProcessor():
self.pre_site_note = None
self.csr = None
self.condition_report = None
- self.identify_files()
self.hubspot_deal_id = None
+ self.energy_report = None
+ self.identify_files()
def identify_files(self):
for file in self.files:
- pdf = pdfReaderToText(file)
- if pdf:
- if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
- self.pre_site_note = pdf.get_reader()
- self.address = self.pre_site_note.survey_information.address
- elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
- self.csr = pdf.get_reader()
- elif pdf.type == ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT:
- self.condition_report = pdf.get_reader()
+            lowered = file.lower()
+            if lowered.endswith('.xml'):
+                # XML files are only announced for now; nothing is read from them.
+                print(f"identified an xml file {lowered}")
+            # Anything that is not a PDF is skipped.
+            pdf = pdfReaderToText(file) if lowered.endswith('.pdf') else None
+            if not pdf:
+                continue
+            # Keep the reader that matches the detected report type.
+            if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
+                self.pre_site_note = pdf.get_reader()
+                self.address = self.pre_site_note.survey_information.address
+            elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
+                self.csr = pdf.get_reader()
+            elif pdf.type in (ReportType.WARM_HOMES_CONDITION_REPORT, ReportType.ECO_CONDITION_REPORT):
+                self.condition_report = pdf.get_reader()
+            elif pdf.type == ReportType.RDSAP_ENERGY_REPORT:
+                self.energy_report = pdf.get_reader()
def load_condition_report(self, db_session):
general_information = self.load_general_information_from_condition_report(db_session)
diff --git a/etl/utils/utils.py b/etl/utils/utils.py
index 584381a..ff77015 100644
--- a/etl/utils/utils.py
+++ b/etl/utils/utils.py
@@ -37,4 +37,4 @@ def get_sharepoint_path(url):
raise SharePointURLError(f"The URL does not contain 'id=' parameter. URL: {url}")
except (IndexError, ValueError) as e:
- raise SharePointURLError(f"Error parsing SharePoint URL: {url}. Reason: {e}")
\ No newline at end of file
+ raise SharePointURLError(f"Error with SharePoint URL, please check {url}. Reason: {e}")
\ No newline at end of file