diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 14e50460..dc71d449 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,6 +2,7 @@ import os import PyPDF2 import re import pandas as pd +from tqdm import tqdm FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -137,6 +138,10 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): elif "summary" in pdf_file.lower(): # Treat this as a Summary Report return extract_summary_report(pdf_path) + elif is_summary_report(first_page_text): + # other ways to detect a summary report + # Treat this as a Summary Report + return extract_summary_report(pdf_path) # If no relevant PDF is found, raise an exception raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") @@ -150,6 +155,13 @@ def is_energy_report(text): return text.startswith("ENERGY REPORT") +def is_summary_report(text): + """ + Determines if the provided text indicates that the PDF is a Summary Report. + """ + return text.startswith("Summary Information") + + def extract_from_survey_folder_files(survey_folder_path): """ Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. @@ -184,7 +196,7 @@ def main(): survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] extracted_data = [] - for survey_folder in survey_folders: + for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(FILE_PATH, survey_folder) # List the folders inside of the survey folder diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 2cabb047..70bec3cc 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,2 +1,3 @@ PyPDF2 pandas +tqdm