poc done for now

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-29 12:10:29 +00:00
parent 3cd9670d1a
commit c6e02836a8
3 changed files with 101 additions and 6 deletions

View file

@ -29,6 +29,7 @@ output_template = {
"Tenant - Name": None,
"Tenant - Phone": None,
"R. Assessor - Name": None,
"R. Coordinator - Name": None,
"Trustmark Licence Number": None,
"Retrofit Assessment Date": None,
"Company Name": None,
@ -100,6 +101,7 @@ def handler():
"full sap xml": FullSapParser,
"pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
"elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
"core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
}
extracted = []
@ -183,9 +185,10 @@ def handler():
report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
if report_type != "elmhurst project handover":
continue
blah
file_extractor = extractors[report_type]
extracted_contents[report_type] = file_extractor(filepath).extract()
output_row_data = output_template.copy()
# dict_keys([ 'City/County', 'District/Town',
@ -193,11 +196,9 @@ def handler():
# 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone',
# 'Owner - Email', 'Tenant - Name', 'Tenant - Phone',
# 'Trustmark Licence Number',
# 'Company Name', 'Retrofit Designer Name',
# Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat
# Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures
# Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
# Populate the output row data
# Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness',
# 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
update_dictionary_with_check(
output_row_data,
@ -297,6 +298,29 @@ def handler():
{"Pre Air Tightness": ap50}
)
if extracted_contents.get("elmhurst project handover"):
handover_to_insert = {
"Number of Eligible Measures Installed": len(
extracted_contents["elmhurst project handover"]["Measures Fitted"]
),
"Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"],
"Company Name": extracted_contents["elmhurst project handover"]["Installer Name"],
"R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"],
}
update_dictionary_with_check(output_row_data, handover_to_insert)
if extracted_contents.get("core logic pas assessment report"):
cr_to_insert = {
"No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"],
}
update_dictionary_with_check(
output_row_data,
cr_to_insert
)
extracted.append(output_row_data)
extracted_df = pd.DataFrame(extracted)
extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv",
index=False)

View file

@ -11,3 +11,4 @@ pymupdf
pytesseract
pdf2image
pillow
pdfplumber

View file

@ -1,5 +1,6 @@
import PyPDF2
import re
import pdfplumber
from collections import Counter
from utils.logger import setup_logger
from xml.dom.minidom import parseString
@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text):
return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text
def is_core_logic_pas_assessment_report(text):
    """
    Determines if the provided text indicates that the PDF is a CoreLogic PAS Assessment Report.

    The report is recognised by the CoreLogic banner at the very start of the text.
    Leading whitespace is ignored so that blank lines or spaces emitted by the PDF
    text extraction before the banner do not defeat the detection.

    Args:
        text: Text of the PDF's first page; may be empty or None.

    Returns:
        bool: True if the text begins with the CoreLogic PAS Assessment banner.
    """
    if not text:
        # Nothing was extracted from the page, so it cannot be identified.
        return False
    return text.lstrip().startswith("Generated Using CoreLogic UK PAS Assessment")
def detect_pdf_report_type(pdf_path):
"""
Detects the type of report based on content or filename.
@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path):
return "pulse air permeability"
elif is_elmhurst_project_handover(first_page_text):
return "elmhurst project handover"
elif is_core_logic_pas_assessment_report(first_page_text):
return "core logic pas assessment report"
return None
@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor:
self.file_path = file_path
def extract(self):
    """
    Extracts the handover fields from an Elmhurst project handover PDF.

    The text of every page is read with PyPDF2 and each field is located with a
    regular expression.

    Returns:
        dict: "Retrofit Coordinator Name", "Retrofit Coordinator ID", "Designer Name"
            and "Installer Name" as stripped strings, plus "Measures Fitted" as a
            list of the bulleted measure names.

    Raises:
        ValueError: If any of the expected fields cannot be matched in the text.
    """
    with open(self.file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the text in one join; "or ''" tolerates a page that yields no text.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    # Regex patterns
    patterns = {
        "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)",
        "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)",
        "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:",
        "Designer Name": r"Designer Name\(s\):\s*(.+)",
        "Installer Name": r"Installer Name\(s\):\s*(.+)",
    }

    # Extract data
    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if not match:
            # Every field is required; fail loudly rather than emit a partial row.
            raise ValueError(f"Could not match {key}")
        if key == "Measures Fitted":
            # Special handling for multiline measures: one entry per bullet point
            measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1))
            data[key] = [m.strip() for m in measures]
        else:
            data[key] = match.group(1).strip()
    return data
class CoreLogicPasAssessmentReportExtractor:
    """
    A utility class for extracting specific data from CoreLogic PAS Assessment Reports.
    """

    def __init__(self, file_path):
        # Path of the PDF report to read.
        self.file_path = file_path

    @staticmethod
    def _bedrooms_from_row(row):
        """Returns the first whole-number cell of a "Number of bedrooms" row, else None."""
        if not any("Number of bedrooms" in str(cell) for cell in row):
            return None
        for cell in row:
            # Cells may be None. isdecimal() is used rather than isdigit() because
            # isdigit() also accepts characters (e.g. superscripts) that int() rejects.
            if cell and cell.strip().isdecimal():
                return int(cell.strip())
        return None

    def extract(self):
        """
        Extracts the number of bedrooms from the tables in the report.

        Pages are scanned in order and scanning stops at the first table row that
        mentions "Number of bedrooms" and contains a whole-number cell.

        Returns:
            dict: {"Number of bedrooms": int} when the value is found, otherwise an
                empty dict.
        """
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                # "or []" keeps the loop safe when no tables are detected on the page.
                for table in page.extract_tables() or []:
                    for row in table:
                        bedrooms = self._bedrooms_from_row(row)
                        if bedrooms is not None:
                            # Stop all further processing once the value is found.
                            return {"Number of bedrooms": bedrooms}
        return {}