From 3cd9670d1aa49b7b71b9fa59739b82ab2b9e62dd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 15:53:48 +0000 Subject: [PATCH] adding file detection for elmhurst project handover --- etl/lodgement/app.py | 29 ++++++++++++++++++++++++++++- utils/file_data_extraction.py | 21 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index c75ece4c..2bdeb3d7 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -98,7 +98,8 @@ def handler(): "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, - "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor + "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, + "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, } extracted = [] @@ -159,6 +160,32 @@ def handler(): extracted_contents[report_type] = file_extractor(filepath).extract() + lodgement_folder = os.path.join( + property_folder_path, [f for f in subfolders if "TrustMark Lodgement" in f][0] + ) + # Within the lodgement folder, we want the required documents sub-folder + lodgement_subfolders = [ + file for file in os.listdir(lodgement_folder) if os.path.isdir(os.path.join(lodgement_folder, file)) + ] + required_documents_folder = os.path.join( + lodgement_folder, [f for f in lodgement_subfolders if "required documents" in f.lower()][0] + ) + # List the contents + required_documents_contents = [ + file for file in os.listdir(required_documents_folder) if + os.path.isfile(os.path.join(required_documents_folder, file)) + ] + + # There are only a few file types we actually want to process in here for the moment + for filename in required_documents_contents: + filepath = os.path.join(required_documents_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type != "elmhurst project handover": + continue + blah + file_extractor = extractors[report_type] + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index c60f01b4..ef02e7f0 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -50,6 +50,13 @@ def is_pulse_air_permeability(text): return text.startswith("Air Permeability Test Report @O PULSE") +def is_elmhurst_project_handover(text): + """ + Determines if the provided text indicates that the PDF is an Elmhurst Project Handover Report. + """ + return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. @@ -78,6 +85,8 @@ def detect_pdf_report_type(pdf_path): return "elmhurst evidence report" elif is_pulse_air_permeability(first_page_text): return "pulse air permeability" + elif is_elmhurst_project_handover(first_page_text): + return "elmhurst project handover" return None @@ -1057,3 +1066,15 @@ class PulseAirPermeabilityExtractor: } return data + + +class ElmhurstProjectHandoverExtractor: + """ + A utility class for extracting specific data from The Elmhurst Project Handover document + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + pass