import re
from typing import Optional

import boto3
import fitz
import PyPDF2


class OsmosisConditionReportParser:
    """Read a condition-report PDF and pull selected fields out of its text.

    The PDF is loaded once, at construction time, and its concatenated page
    text is stored on ``pdf_text``. ``extract()`` then runs regular
    expressions over that text.

    Attributes:
        s3_client: boto3 S3 client, created unconditionally in ``__init__``.
        bucket_name: S3 bucket to read from, or ``None``/empty to read
            ``filekey`` as a local path.
        filekey: Local file path, or the S3 object key when ``bucket_name``
            is set.
        pdf_text: Text of all pages joined together, in page order.
    """

    # Compiled once at class creation; both patterns capture the value that
    # follows the field label in the extracted page text.
    _BEDROOMS_RE = re.compile(r"No\. of Bedrooms \(Total\)\s*(\d+)")
    _PATHWAY_RE = re.compile(r"Risk\s*Assessment\s*Pathway\s*([A-Z])")

    def __init__(self, filekey: str, bucket_name: Optional[str] = None):
        """Store the source location and immediately load the PDF text.

        Args:
            filekey: Local path of the PDF, or its S3 key when
                ``bucket_name`` is given.
            bucket_name: Optional S3 bucket holding the PDF.

        Raises:
            ValueError: If the PDF cannot be found, read, or parsed.
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None
        self._read_file()

    def _read_file(self) -> None:
        """Load the PDF (locally or from S3) and cache its text in ``pdf_text``.

        Raises:
            ValueError: If the file cannot be found, read, or parsed. The
                underlying exception is attached as ``__cause__``.
        """
        try:
            if self.bucket_name:
                # Fetch the whole object into memory and let PyMuPDF open it
                # from bytes; the filetype must be stated for stream input.
                response = self.s3_client.get_object(
                    Bucket=self.bucket_name, Key=self.filekey
                )
                pdf_bytes = response["Body"].read()
                document = fitz.open(stream=pdf_bytes, filetype="pdf")
            else:
                document = fitz.open(self.filekey)

            # The context manager closes the document once text is extracted.
            with document as pdf:
                self.pdf_text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            raise ValueError(
                f"An error occurred while reading or parsing the PDF: {e}"
            ) from e

    def extract(self) -> dict:
        """Return the fields parsed from the PDF text.

        Returns:
            A dict with two keys: ``"No. of Bedrooms"`` (``int``) and
            ``"Risk Assessment Pathway"`` (a single uppercase letter).

        Raises:
            ValueError: If either field is not present in the PDF text.
        """
        return {
            "No. of Bedrooms": int(
                self._require_match(self._BEDROOMS_RE, "No. of Bedrooms")
            ),
            "Risk Assessment Pathway": self._require_match(
                self._PATHWAY_RE, "Risk Assessment Pathway"
            ),
        }

    def _require_match(self, pattern, field: str) -> str:
        """Return group 1 of ``pattern`` in the PDF text or raise ValueError."""
        # ``re.search`` returns None on no match; surface that as a clear
        # error naming the field instead of an AttributeError on ``.group``.
        match = pattern.search(self.pdf_text or "")
        if match is None:
            raise ValueError(f"Field not found in PDF text: {field}")
        return match.group(1)