refactoring structure of extraction code

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 11:21:54 +00:00
parent 7513e475d3
commit 0332c77098

View file

@ -43,6 +43,42 @@ def extract_summary_report(pdf_path):
return data
def extract_retrofit_assessment_folder(retrofit_folder_path):
    """
    Handles extraction from a retrofit assessment folder if it exists and has content.

    Searches ``retrofit_folder_path`` for a PDF whose file name contains the word "summary" and passes
    its full path to ``extract_summary_report``. Both the word and the ".pdf" extension are matched
    case-insensitively.

    :param retrofit_folder_path: Path of the retrofit assessment folder to search.
    :return: The result of ``extract_summary_report`` for the first matching PDF (in sorted name order),
        or None when the folder contains no matching file.
    :raises OSError: Propagated from ``os.listdir`` if the folder cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the choice between several
    # candidate PDFs reproducible from run to run.
    for name in sorted(os.listdir(retrofit_folder_path)):
        lowered = name.lower()
        # Test the lower-cased name for both conditions so that e.g. "Summary Report.PDF" is not missed.
        if "summary" in lowered and lowered.endswith(".pdf"):
            return extract_summary_report(os.path.join(retrofit_folder_path, name))
    return None  # If no relevant PDF is found
def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.

    Searches ``survey_folder_path`` itself (not its subfolders) for a PDF whose file name contains the
    word "summary" and passes its full path to ``extract_summary_report``. Both the word and the ".pdf"
    extension are matched case-insensitively.

    :param survey_folder_path: Path of the survey folder to search.
    :return: The result of ``extract_summary_report`` for the first matching PDF (in sorted name order),
        or None when the folder contains no matching file.
    :raises OSError: Propagated from ``os.listdir`` if the folder cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the choice between several
    # candidate PDFs reproducible from run to run.
    for name in sorted(os.listdir(survey_folder_path)):
        lowered = name.lower()
        # Test the lower-cased name for both conditions so that e.g. "Summary Report.PDF" is not missed.
        if "summary" in lowered and lowered.endswith(".pdf"):
            return extract_summary_report(os.path.join(survey_folder_path, name))
    return None  # If no relevant PDF is found
def main():
"""
This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
@ -52,40 +88,38 @@ def main():
extracted_data = []
for survey_folder in survey_folders:
survey_folder_path = os.path.join(FILE_PATH, survey_folder)
# List the folders inside of the survey folder
survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder))
if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))]
survey_subfolders = [name for name in os.listdir(survey_folder_path)
if os.path.isdir(os.path.join(survey_folder_path, name))]
if not survey_subfolders:
continue
# Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment":
# If it exists, we will use the data from that folder
# Check if there's a "retrofit assessment" folder
retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
# List contents of the retrofit folder
retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder))
# If retrofit assessment folder exists, check if it has content
if retrofit_folder:
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
if os.listdir(retrofit_folder_path): # If not empty
summary_data = extract_retrofit_assessment_folder(retrofit_folder_path)
if summary_data:
summary_data = {
"survey_folder": survey_folder,
**summary_data
}
extracted_data.append(summary_data)
continue
if not retrofit_files:
continue
# We now look for specific files:
# 1) Check the summary report - the title will contain the word "summary" (matched case-insensitively) and the
# file extension is .pdf
summary_report = next(
(name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None
)
if summary_report is not None:
pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report)
summary_data = extract_summary_report(pdf_path)
# If no retrofit folder or it was empty, check files in survey_folder
summary_data = extract_from_survey_folder_files(survey_folder_path)
if summary_data:
summary_data = {
"survey_folder": survey_folder,
**summary_data
}
extracted_data.append(summary_data)
continue
raise NotImplementedError("IMPLEMENT ME!")
print("Extracted Data:", extracted_data)
if __name__ == "__main__":