# Model/etl/customers/cottons/parse_pdf_asset_list.py
# 2024-11-27 10:16:06 +00:00
#
# 64 lines
# 2.4 KiB
# Python

import re
import pandas as pd
from PyPDF2 import PdfReader
# Input PDFs, in processing order: the un-numbered "(dragged)" export first,
# followed by the numbered exports 2 through 6.
# NOTE(review): these are absolute paths on a single user's machine.
file_paths = [
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
    *(
        f"/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) {n}.pdf"
        for n in range(2, 7)
    ),
]
# Function to extract text from PDFs
def extract_text_from_pdf_with_pypdf2(file_path):
    """Return the text of every page of the PDF at *file_path* as one string.

    The file is opened with PyPDF2's ``PdfReader``; each page's
    ``extract_text()`` result is concatenated in page order with no
    separator inserted between pages.
    """
    pages = PdfReader(file_path).pages
    return "".join(page.extract_text() for page in pages)
# Patterns used to turn the raw PDF text into rows. They are compiled once
# here instead of being re-specified on every pass of the per-file loop.
# Report title line (e.g. "Managed Property Report as at 27 November 2024").
_TITLE_RE = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
# Repeated column-header line.
_HEADER_RE = re.compile(r"Code Property Address Management Type")
# Candidate row: the shortest run of text within one line ending in "Managed"
# ("." does not match newlines, so a candidate never spans lines).
_ROW_CANDIDATE_RE = re.compile(r".*?Managed")
# Structured row: first whitespace-free token is the code, everything between
# it and the trailing "Managed" is the address.
_ROW_RE = re.compile(r"(\S+)\s+(.+?)\s+Managed")


def _parse_managed_rows(text):
    """Parse extracted report text into ``(code, address, "Managed")`` rows.

    Args:
        text: Raw text extracted from one PDF.

    Returns:
        A ``(rows, skipped)`` pair. ``rows`` is the list of parsed tuples in
        the order they appear in *text*; ``skipped`` is the number of
        candidate rows (text ending in "Managed") that did not have both a
        code and an address and were therefore left out.
    """
    # Step 1: Remove titles and repeated headers
    cleaned_text = _HEADER_RE.sub("", _TITLE_RE.sub("", text))

    rows = []
    skipped = 0
    # Step 2: Extract rows ending with "Managed"
    for candidate in _ROW_CANDIDATE_RE.findall(cleaned_text):
        # Step 3: Parse rows into structured data
        match = _ROW_RE.match(candidate.strip())
        if match is None:
            # Count instead of dropping silently so data loss is visible.
            skipped += 1
            continue
        rows.append((match.group(1).strip(), match.group(2).strip(), "Managed"))
    return rows, skipped


# Initialize a list to hold all parsed data
all_parsed_data = []

# Process each PDF individually
for file_number, path in enumerate(file_paths, start=1):
    extracted_text = extract_text_from_pdf_with_pypdf2(path)
    parsed_data, skipped_rows = _parse_managed_rows(extracted_text)

    # Append parsed data to the global list
    all_parsed_data.extend(parsed_data)

    # Provide feedback for debugging
    print(f"File {file_number} processed: {len(parsed_data)} rows")
    if skipped_rows:
        print(
            f"File {file_number}: {skipped_rows} candidate rows could not be "
            "parsed and were skipped"
        )

# Step 4: Create a unified DataFrame
final_df = pd.DataFrame(
    all_parsed_data,
    columns=["Code", "Property Address", "Management Type"],
)

# Step 5: Save the unified DataFrame to an Excel file
final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
final_df.to_excel(final_output_file_path, index=False)

# Provide feedback
print(f"All files processed and combined. Total rows: {len(final_df)}")
print(f"Unified file saved to: {final_output_file_path}")