diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index dc71d449..30a23e86 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -118,30 +118,23 @@ def extract_epr(pdf_path):
     return data
 
 
-def extract_retrofit_assessment_folder(retrofit_folder_path):
+def extract_retrofit_pdfs(data_folder_path):
     """
-    Handles extraction from a retrofit assessment folder if it exists and has content.
+    Handles extraction from a retrofit data folder if it exists and has content.
+    Returns the data parsed from the first PDF recognised as an EPR or Summary report.
+    :param data_folder_path: String path to the folder holding the PDF reports
+    :return: Data extracted by extract_epr or extract_summary_report
+    :raises FileNotFoundError: If the folder holds no recognisable report
     """
-    retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")]
+    retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")]
 
     for pdf_file in retrofit_files:
-        pdf_path = os.path.join(retrofit_folder_path, pdf_file)
-
-        # Attempt to read the first page of the PDF to determine the report type
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
-
-        if is_energy_report(first_page_text):
-            # Treat this as an Energy Report
-            return extract_epr(pdf_path)
-        elif "summary" in pdf_file.lower():
-            # Treat this as a Summary Report
-            return extract_summary_report(pdf_path)
-        elif is_summary_report(first_page_text):
-            # other ways to detect a summary report
-            # Treat this as a Summary Report
-            return extract_summary_report(pdf_path)
+        pdf_path = os.path.join(data_folder_path, pdf_file)
+        try:
+            return detect_and_parse_report(pdf_path, pdf_file)
+        except NotImplementedError:
+            # Unrecognised report type: keep looking through the remaining PDFs
+            continue
 
     # If no relevant PDF is found, raise an exception
-    raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
+    raise FileNotFoundError(f"No valid report (EPR or Summary) found in {data_folder_path}.")
@@ -162,30 +155,30 @@ def is_summary_report(text):
     return text.startswith("Summary Information")
 
 
-def extract_from_survey_folder_files(survey_folder_path):
+def detect_and_parse_report(pdf_path, pdf_file):
     """
-    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
+    Detects the type of report and extracts the relevant data.
+    :param pdf_path: String path to the PDF file
+    :param pdf_file: String name of the PDF file
+    :return: Data extracted by extract_epr or extract_summary_report
+    :raises NotImplementedError: If the PDF is neither an Energy Report nor a Summary Report
     """
-    survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")]
+    # Attempt to read the first page of the PDF to determine type
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        first_page_text = reader.pages[0].extract_text() if reader.pages else ""
 
-    for pdf_file in survey_files:
-        pdf_path = os.path.join(survey_folder_path, pdf_file)
-
-        # Attempt to read the first page of the PDF to determine type
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
-
-        if is_energy_report(first_page_text):
-            # Treat this as an Energy Report
-            return extract_epr(pdf_path)
-        elif "summary" in pdf_file.lower():
-            # Treat this as a Summary Report
-            return extract_summary_report(pdf_path)
-        else:
-            raise NotImplementedError("Implement me")
-
-    return None
+    if is_energy_report(first_page_text):
+        # Treat this as an Energy Report
+        return extract_epr(pdf_path)
+    elif "summary" in pdf_file.lower():
+        # Treat this as a Summary Report
+        return extract_summary_report(pdf_path)
+    elif is_summary_report(first_page_text):
+        # No filename hint, but the first page identifies a Summary Report
+        return extract_summary_report(pdf_path)
+    else:
+        raise NotImplementedError("Implement me")
 
 
 def main():
@@ -210,7 +203,7 @@ def main():
         if retrofit_folder:
             retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
             if os.listdir(retrofit_folder_path):  # If not empty
-                summary_data = extract_retrofit_assessment_folder(retrofit_folder_path)
+                summary_data = extract_retrofit_pdfs(retrofit_folder_path)
                 if summary_data:
                     summary_data = {
                         "survey_folder": survey_folder,
@@ -219,7 +212,7 @@ def main():
                     extracted_data.append(summary_data)
                     continue
         # If no retrofit folder or it was empty, check files in survey_folder
-        summary_data = extract_from_survey_folder_files(survey_folder_path)
+        summary_data = extract_retrofit_pdfs(survey_folder_path)
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,