adding additional catch for summary report

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 14:20:33 +00:00
parent 70d02075cf
commit 371f17f87e
2 changed files with 14 additions and 1 deletions

View file

@ -2,6 +2,7 @@ import os
import PyPDF2
import re
import pandas as pd
from tqdm import tqdm
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
@ -137,6 +138,10 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
elif "summary" in pdf_file.lower():
# Treat this as a Summary Report
return extract_summary_report(pdf_path)
elif is_summary_report(first_page_text):
# other ways to detect a summary report
# Treat this as a Summary Report
return extract_summary_report(pdf_path)
# If no relevant PDF is found, raise an exception
raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
@ -150,6 +155,13 @@ def is_energy_report(text):
return text.startswith("ENERGY REPORT")
def is_summary_report(text):
    """
    Determines if the provided text indicates that the PDF is a Summary Report.

    A PDF is treated as a Summary Report when its text begins with the
    heading "Summary Information". Leading whitespace is ignored because
    extracted PDF text can start with blank lines or spaces before the
    heading.

    Args:
        text: Text extracted from the PDF (the caller passes the first
            page's text). May be empty or None when nothing was extracted.

    Returns:
        True if the text starts with "Summary Information" (after leading
        whitespace), otherwise False. Empty or None input yields False.
    """
    # Nothing extracted (None or "") cannot match; avoid AttributeError on None.
    if not text:
        return False
    return text.lstrip().startswith("Summary Information")
def extract_from_survey_folder_files(survey_folder_path):
"""
Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
@ -184,7 +196,7 @@ def main():
survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
extracted_data = []
for survey_folder in survey_folders:
for survey_folder in tqdm(survey_folders):
survey_folder_path = os.path.join(FILE_PATH, survey_folder)
# List the folders inside of the survey folder

View file

@ -1,2 +1,3 @@
PyPDF2
pandas
tqdm