mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
import re
|
|
import boto3
|
|
import PyPDF2
|
|
import fitz
|
|
|
|
|
|
class OsmosisConditionReportParser:
    """Pull selected fields out of the text of a condition-report PDF.

    The PDF is opened once at construction time and the text of all of its
    pages is stored in ``pdf_text``. ``extract`` then locates individual
    fields in that text with regular expressions.
    """

    # Compiled once at class level; the patterns are the ones extract()
    # has always used.
    _BEDROOMS_RE = re.compile(r"No\. of Bedrooms \(Total\)\s*(\d+)")
    _PATHWAY_RE = re.compile(r"Risk\s*Assessment\s*Pathway\s*([A-Z])")

    def __init__(self, filekey, bucket_name=None):
        """Store the file location and read the PDF text immediately.

        Args:
            filekey: Path of the PDF to open when no bucket is given.
            bucket_name: When truthy, selects the S3 branch of
                ``_read_file``, which is not implemented yet.

        Raises:
            NotImplementedError: If ``bucket_name`` is truthy.
            ValueError: If the local file cannot be found, opened, or read.
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None

        self._read_file()

    def _read_file(self) -> None:
        """Read the PDF at ``filekey`` and store its text in ``pdf_text``.

        Raises:
            NotImplementedError: If ``bucket_name`` is truthy (S3 reading is
                not implemented).
            ValueError: If the file cannot be found, opened, or read.
        """
        if self.bucket_name:
            # Raised outside the try block below so the broad
            # ``except Exception`` cannot rewrap it as a ValueError.
            raise NotImplementedError("Reading from S3 is not implemented yet")

        try:
            with fitz.open(self.filekey) as pdf:
                # Join once instead of growing a string page by page.
                text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            raise ValueError(
                f"An error occurred while reading or parsing the PDF: {e}"
            ) from e

        self.pdf_text = text

    def _find_field(self, pattern, label):
        """Return capture group 1 of ``pattern`` in the PDF text.

        Raises:
            ValueError: If the pattern does not match (or no text was read).
        """
        # ``or ""`` turns a never-populated pdf_text into a clean "not found"
        # instead of a TypeError from the regex engine.
        found = pattern.search(self.pdf_text or "")
        if found is None:
            raise ValueError(f"Field not found in PDF text: {label}")
        return found.group(1)

    def extract(self) -> dict:
        """Return the fields parsed from the PDF text.

        Returns:
            A dict with ``"No. of Bedrooms"`` (int) and
            ``"Risk Assessment Pathway"`` (a single uppercase letter).

        Raises:
            ValueError: If either field cannot be located in the text.
        """
        return {
            "No. of Bedrooms": int(
                self._find_field(self._BEDROOMS_RE, "No. of Bedrooms")
            ),
            "Risk Assessment Pathway": self._find_field(
                self._PATHWAY_RE, "Risk Assessment Pathway"
            ),
        }
|