mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
64 lines
2.4 KiB
Python
64 lines
2.4 KiB
Python
import re
|
|
import pandas as pd
|
|
from PyPDF2 import PdfReader
|
|
|
|
# Source PDFs, listed in the order they are read and combined.
file_paths = [
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf",
]
|
|
|
|
|
|
# Function to extract text from PDFs
def extract_text_from_pdf_with_pypdf2(file_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        One string containing the result of ``extract_text()`` for each page,
        placed back to back with no separator inserted between pages. An
        empty string is returned when the reader exposes no pages.
    """
    reader = PdfReader(file_path)
    # Build the result with a single join rather than growing a string with
    # ``+=`` once per page.
    return "".join(page.extract_text() for page in reader.pages)
|
|
|
|
|
|
# Patterns compiled once up front: the report title and the column header that
# are stripped from the text, the finder for candidate rows, and the splitter
# that breaks a candidate row into its fields.
_REPORT_TITLE_RE = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
_COLUMN_HEADER_RE = re.compile(r"Code Property Address Management Type")
_CANDIDATE_ROW_RE = re.compile(r".*?Managed")
_ROW_FIELDS_RE = re.compile(r"(\S+)\s+(.+?)\s+Managed")

# Collects the (code, address, management type) tuples from every PDF
all_parsed_data = []

# Work through the PDFs one at a time, in the order given by file_paths
for i, path in enumerate(file_paths):
    extracted_text = extract_text_from_pdf_with_pypdf2(path)

    # Step 1: drop the report title first, then the repeated column header
    body_text = _COLUMN_HEADER_RE.sub("", _REPORT_TITLE_RE.sub("", extracted_text))

    # Step 2: gather each shortest same-line run of characters ending in "Managed"
    candidate_rows = _CANDIDATE_ROW_RE.findall(body_text)

    # Step 3: the first whitespace-free token is the code and the text up to
    # the trailing "Managed" is the address; candidates that do not fit this
    # shape are left out without any message
    row_matches = (
        _ROW_FIELDS_RE.match(candidate.strip()) for candidate in candidate_rows
    )
    parsed_data = [
        (found.group(1).strip(), found.group(2).strip(), "Managed")
        for found in row_matches
        if found
    ]

    # Fold this file's rows into the combined list
    all_parsed_data.extend(parsed_data)

    # Per-file progress report
    print(f"File {i + 1} processed: {len(parsed_data)} rows")

# Step 4: one DataFrame holding the rows from all files
final_df = pd.DataFrame(
    all_parsed_data,
    columns=["Code", "Property Address", "Management Type"],
)

# Step 5: write the combined table to an Excel workbook, without the index column
final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
final_df.to_excel(final_output_file_path, index=False)

# Final summary
print(f"All files processed and combined. Total rows: {len(final_df)}")
print(f"Unified file saved to: {final_output_file_path}")
|