Adding file detection for Elmhurst project handover

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-28 15:53:48 +00:00
parent 2cfc881044
commit 3cd9670d1a
2 changed files with 49 additions and 1 deletions

View file

@ -98,7 +98,8 @@ def handler():
"osmosis condition report": OsmosisConditionReportParser,
"elmhurst evidence report": None,
"full sap xml": FullSapParser,
"pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor
"pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
"elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
}
extracted = []
@ -159,6 +160,32 @@ def handler():
extracted_contents[report_type] = file_extractor(filepath).extract()
lodgement_folder = os.path.join(
property_folder_path, [f for f in subfolders if "TrustMark Lodgement" in f][0]
)
# Within the lodgement folder, we want the required documents sub-folder
lodgement_subfolders = [
file for file in os.listdir(lodgement_folder) if os.path.isdir(os.path.join(lodgement_folder, file))
]
required_documents_folder = os.path.join(
lodgement_folder, [f for f in lodgement_subfolders if "required documents" in f.lower()][0]
)
# List the contents
required_documents_contents = [
file for file in os.listdir(required_documents_folder) if
os.path.isfile(os.path.join(required_documents_folder, file))
]
# There are only a few file types we actually want to process in here for the moment
for filename in required_documents_contents:
filepath = os.path.join(required_documents_folder, filename)
if file_extraction_tools.is_pdf(filepath):
report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
if report_type != "elmhurst project handover":
continue
blah
file_extractor = extractors[report_type]
output_row_data = output_template.copy()
# dict_keys([ 'City/County', 'District/Town',

View file

@ -50,6 +50,13 @@ def is_pulse_air_permeability(text):
return text.startswith("Air Permeability Test Report @O PULSE")
def is_elmhurst_project_handover(text):
    """
    Determines if the provided text indicates that the PDF is an Elmhurst Project Handover Report.

    The text matches when it contains either the underscore-separated or the
    space-separated form of the "Retrofit Project Handover" marker. The match
    is case-sensitive.

    Args:
        text: Text to inspect. May be None or empty, in which case the
            document is treated as not matching.

    Returns:
        bool: True if either marker appears in the text, otherwise False.
    """
    # Guard against missing text: a membership test on None would raise TypeError.
    if not text:
        return False
    markers = ("Retrofit_Project_Handover", "Retrofit Project Handover")
    return any(marker in text for marker in markers)
def detect_pdf_report_type(pdf_path):
"""
Detects the type of report based on content or filename.
@ -78,6 +85,8 @@ def detect_pdf_report_type(pdf_path):
return "elmhurst evidence report"
elif is_pulse_air_permeability(first_page_text):
return "pulse air permeability"
elif is_elmhurst_project_handover(first_page_text):
return "elmhurst project handover"
return None
@ -1057,3 +1066,15 @@ class PulseAirPermeabilityExtractor:
}
return data
class ElmhurstProjectHandoverExtractor:
    """
    Extractor for the Elmhurst Project Handover document.

    Holds the path of the document to process. The extraction step is
    currently a placeholder and produces no data.
    """

    def __init__(self, file_path):
        """Remember the path of the handover document."""
        self.file_path = file_path

    def extract(self):
        """
        Placeholder for the extraction step.

        Performs no work at present and returns None.
        """
        # TODO: implement extraction; for now the result is always None.
        return None