"""Combine several "Managed Properties List" PDF exports into one Excel sheet.

Each PDF is read with PyPDF2, the report title and column header lines are
stripped, and every row ending in ``Managed`` is split into a code and an
address. The rows from all files are written to a single workbook.
"""
import re

import pandas as pd

# Paths to the uploaded files
file_paths = [
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf",
]

# Destination of the unified workbook
final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"

# Compiled once here instead of being rebuilt for every file and every row.
_REPORT_TITLE_RE = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
_COLUMN_HEADER_RE = re.compile(r"Code Property Address Management Type")
_ROW_CANDIDATE_RE = re.compile(r".*?Managed")
_ROW_FIELDS_RE = re.compile(r"(\S+)\s+(.+?)\s+Managed")


# Function to extract text from PDFs
def extract_text_from_pdf_with_pypdf2(file_path):
    """Return the text of every page of the PDF at *file_path*.

    Pages are joined with a newline so the last line of one page cannot fuse
    with the first token of the next (which would corrupt the row that
    starts the following page). A page that yields no text contributes an
    empty string instead of raising ``TypeError``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages as a single string.
    """
    # Imported lazily so the parsing helper below can be imported and tested
    # on machines that do not have PyPDF2 installed.
    from PyPDF2 import PdfReader

    reader = PdfReader(file_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def parse_managed_rows(text):
    """Parse report text into ``(code, address, "Managed")`` tuples.

    The report title and the column header are removed first because both
    would otherwise be mistaken for data (the title itself starts with
    ``Managed``). What remains is scanned for spans ending in ``Managed``;
    the first whitespace-delimited token of a span is the code and the rest,
    up to ``Managed``, is the address.

    Args:
        text: Raw text extracted from one PDF.

    Returns:
        A list of ``(code, address, "Managed")`` tuples in document order.
        Spans that do not contain both a code and an address are skipped.
    """
    # Step 1: Remove titles and repeated headers
    cleaned_text = _REPORT_TITLE_RE.sub("", text)
    cleaned_text = _COLUMN_HEADER_RE.sub("", cleaned_text)

    # Step 2 + 3: Extract rows ending with "Managed" and split them.
    # NOTE(review): "." does not match a newline, so an address that wraps
    # over two extracted lines keeps only its last line - confirm against
    # real extraction output.
    parsed_data = []
    for candidate in _ROW_CANDIDATE_RE.findall(cleaned_text):
        match = _ROW_FIELDS_RE.match(candidate.strip())
        if match:
            code = match.group(1).strip()
            address = match.group(2).strip()
            parsed_data.append((code, address, "Managed"))
    return parsed_data


def main():
    """Process every configured PDF and write the combined Excel file.

    Returns:
        The unified ``pandas.DataFrame`` that was written to disk.
    """
    # Holds the parsed rows of all files
    all_parsed_data = []

    # Process each PDF individually
    for file_number, path in enumerate(file_paths, start=1):
        parsed_data = parse_managed_rows(extract_text_from_pdf_with_pypdf2(path))
        all_parsed_data.extend(parsed_data)

        # Provide feedback for debugging
        print(f"File {file_number} processed: {len(parsed_data)} rows")

    # Step 4: Create a unified DataFrame
    final_df = pd.DataFrame(
        all_parsed_data,
        columns=["Code", "Property Address", "Management Type"],
    )

    # Step 5: Save the unified DataFrame to an Excel file
    final_df.to_excel(final_output_file_path, index=False)

    # Provide feedback
    print(f"All files processed and combined. Total rows: {len(final_df)}")
    print(f"Unified file saved to: {final_output_file_path}")
    return final_df


if __name__ == "__main__":
    main()