Model/utils/OsmosisCondtionReportParser.py
2024-11-28 08:38:38 +00:00

49 lines
1.4 KiB
Python

import re
import boto3
import PyPDF2
import fitz
class OsmosisConditionReportParser:
    """Extract selected fields from the text of an Osmosis condition report PDF.

    The PDF is read once, at construction time, and its concatenated page
    text is kept in ``pdf_text``. ``extract()`` then pulls individual fields
    out of that text with regular expressions.

    Args:
        filekey: Local path of the PDF, or the S3 object key when
            ``bucket_name`` is given.
        bucket_name: Optional S3 bucket holding the PDF. When omitted (or
            falsy) ``filekey`` is opened as a local file.

    Raises:
        ValueError: If the PDF cannot be found, downloaded, or read.
    """

    # Compiled once; the patterns are the ones extract() has always used.
    _BEDROOMS_RE = re.compile(r"No\. of Bedrooms \(Total\)\s*(\d+)")
    _PATHWAY_RE = re.compile(r"Risk\s*Assessment\s*Pathway\s*([A-Z])")

    def __init__(self, filekey, bucket_name=None) -> None:
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None  # populated by _read_file()
        self._read_file()

    def _read_file(self) -> None:
        """Read the PDF (from S3 or locally) and store its text in ``pdf_text``.

        Raises:
            ValueError: If the file cannot be found, downloaded, or parsed
                as a PDF. The underlying exception is chained as the cause.
        """
        try:
            if self.bucket_name:
                # Fetch the object into memory and let PyMuPDF parse the bytes.
                response = self.s3_client.get_object(
                    Bucket=self.bucket_name, Key=self.filekey
                )
                document = fitz.open(stream=response["Body"].read(), filetype="pdf")
            else:
                document = fitz.open(self.filekey)
            # The document is a context manager, so it is closed on exit.
            with document as pdf:
                self.pdf_text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            # Deliberately broad: S3 and PyMuPDF failures are all surfaced as
            # ValueError so callers have a single exception type to handle.
            raise ValueError(
                f"An error occurred while reading or parsing the PDF: {e}"
            ) from e

    @staticmethod
    def _require_match(pattern, text, field_name):
        """Return group 1 of ``pattern`` in ``text`` or raise ``ValueError``."""
        match = pattern.search(text)
        if match is None:
            raise ValueError(f"Field not found in report text: {field_name}")
        return match.group(1)

    def extract(self) -> dict:
        """Return the fields parsed from the report text.

        Returns:
            A dict with two keys: ``"No. of Bedrooms"`` (int) and
            ``"Risk Assessment Pathway"`` (a single uppercase letter).

        Raises:
            ValueError: If either field is missing from the report text.
        """
        text = self.pdf_text or ""
        return {
            "No. of Bedrooms": int(
                self._require_match(self._BEDROOMS_RE, text, "No. of Bedrooms")
            ),
            "Risk Assessment Pathway": self._require_match(
                self._PATHWAY_RE, text, "Risk Assessment Pathway"
            ),
        }