# Model/utils/OsmosisCondtionReportParser.py

import re
import boto3
import PyPDF2
import fitz


class OsmosisConditionReportParser:
    """Pull selected fields out of a condition-report PDF.

    The full text of the PDF is loaded once, at construction time, and
    stored on ``pdf_text``. ``extract`` then locates individual fields in
    that text with regular expressions.
    """

    # Compiled once for the class; each pattern captures the value that
    # follows the corresponding label in the extracted PDF text.
    _BEDROOMS_PATTERN = re.compile(r"No\. of Bedrooms \(Total\)\s*(\d+)")
    _PATHWAY_PATTERN = re.compile(r"Risk\s*Assessment\s*Pathway\s*([A-Z])")

    def __init__(self, filekey: str, bucket_name: "str | None" = None) -> None:
        """Create the parser and immediately load the PDF text.

        Args:
            filekey: Local path of the PDF, or the S3 object key when
                ``bucket_name`` is given.
            bucket_name: S3 bucket holding the PDF. When falsy, ``filekey``
                is opened as a local file.

        Raises:
            ValueError: If the PDF cannot be found, read, or parsed.
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None

        self._read_file()

    def _read_file(self) -> None:
        """Load the PDF (locally or from S3) and store its text in ``pdf_text``.

        The text of every page is concatenated in page order.

        Raises:
            ValueError: If the file cannot be found, read, or parsed. The
                underlying exception is chained as ``__cause__``.
        """
        try:
            if self.bucket_name:
                # Download the whole object and let PyMuPDF parse it from
                # memory; ``filetype`` is required because a byte stream
                # carries no file extension to infer the format from.
                response = self.s3_client.get_object(
                    Bucket=self.bucket_name, Key=self.filekey
                )
                document = fitz.open(stream=response["Body"].read(), filetype="pdf")
            else:
                document = fitz.open(self.filekey)

            with document as pdf:
                self.pdf_text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            # Every other failure (S3 errors, corrupt PDF, ...) is reported
            # to callers uniformly as ValueError.
            raise ValueError(
                f"An error occurred while reading or parsing the PDF: {e}"
            ) from e

    def extract(self) -> dict:
        """Return the fields parsed from the loaded PDF text.

        Returns:
            A dict with two keys: ``"No. of Bedrooms"`` (an ``int``) and
            ``"Risk Assessment Pathway"`` (a single uppercase letter).

        Raises:
            ValueError: If either field cannot be found in the PDF text.
        """
        bedrooms = self._find_field(self._BEDROOMS_PATTERN, "No. of Bedrooms")
        pathway = self._find_field(self._PATHWAY_PATTERN, "Risk Assessment Pathway")
        return {
            "No. of Bedrooms": int(bedrooms),
            "Risk Assessment Pathway": pathway,
        }

    def _find_field(self, pattern: "re.Pattern", label: str) -> str:
        """Return the first capture group of ``pattern`` in the PDF text.

        Raises a ``ValueError`` naming ``label`` when there is no match,
        instead of failing with an ``AttributeError`` on ``None``.
        """
        # ``or ""`` turns missing text into a plain "not found" error.
        match = pattern.search(self.pdf_text or "")
        if match is None:
            raise ValueError(f"Field not found in report: {label}")
        return match.group(1)