def _extract_summary_from_folder(folder_path):
    """
    Finds a summary report PDF directly inside ``folder_path`` and extracts it.

    A file qualifies when its name contains "summary" and ends with ".pdf"; both checks ignore case, so
    "Summary Report.PDF" is accepted. Only the folder itself is inspected, not its sub-folders.

    :param folder_path: Path of the directory to search.
    :return: Whatever extract_summary_report() returns for the first qualifying file, or None when the folder
        contains no qualifying file.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the choice repeatable when a folder holds
    # more than one candidate.
    for name in sorted(os.listdir(folder_path)):
        lowered = name.lower()
        if "summary" not in lowered or not lowered.endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, name)
        # A sub-folder can carry a matching name; only real files are handed to the PDF extractor.
        if os.path.isfile(pdf_path):
            return extract_summary_report(pdf_path)

    return None  # If no relevant PDF is found


def extract_retrofit_assessment_folder(retrofit_folder_path):
    """
    Handles extraction from a retrofit assessment folder.

    :param retrofit_folder_path: Path of the "retrofit assessment" folder to search.
    :return: The extracted summary report data, or None if the folder has no summary report PDF.
    """
    return _extract_summary_from_folder(retrofit_folder_path)


def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no usable 'retrofit assessment' folder
    exists.

    :param survey_folder_path: Path of the survey folder to search.
    :return: The extracted summary report data, or None if the folder has no summary report PDF.
    """
    return _extract_summary_from_folder(survey_folder_path)
@@ -52,40 +88,38 @@ def main(): extracted_data = [] for survey_folder in survey_folders: + survey_folder_path = os.path.join(FILE_PATH, survey_folder) + # List the folders inside of the survey folder - survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder)) - if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))] + survey_subfolders = [name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name))] - if not survey_subfolders: - continue - - # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment: - # If it exists, we will use the data from that folder + # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) - # List contents of the retrofit folder - retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder)) + # If retrofit assessment folder exists, check if it has content + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_assessment_folder(retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data + } + extracted_data.append(summary_data) + continue - if not retrofit_files: - continue - - # We now look for specific files: - # 1) Check the summary report.- the title will contain the word "summary" (lowercase) and the file extension is - # .pdf - summary_report = next( - (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) - if summary_report is not None: - pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report) - summary_data = extract_summary_report(pdf_path) + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = 
extract_from_survey_folder_files(survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, **summary_data } extracted_data.append(summary_data) - continue - raise NotImplementedError("IMPLEMENT ME!") + print("Extracted Data:", extracted_data) if __name__ == "__main__":