mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
poc done for now
This commit is contained in:
parent
3cd9670d1a
commit
c6e02836a8
3 changed files with 101 additions and 6 deletions
|
|
@ -29,6 +29,7 @@ output_template = {
|
|||
"Tenant - Name": None,
|
||||
"Tenant - Phone": None,
|
||||
"R. Assessor - Name": None,
|
||||
"R. Coordinator - Name": None,
|
||||
"Trustmark Licence Number": None,
|
||||
"Retrofit Assessment Date": None,
|
||||
"Company Name": None,
|
||||
|
|
@ -100,6 +101,7 @@ def handler():
|
|||
"full sap xml": FullSapParser,
|
||||
"pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
|
||||
"elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
|
||||
"core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
|
||||
}
|
||||
|
||||
extracted = []
|
||||
|
|
@ -183,9 +185,10 @@ def handler():
|
|||
report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
|
||||
if report_type != "elmhurst project handover":
|
||||
continue
|
||||
blah
|
||||
file_extractor = extractors[report_type]
|
||||
|
||||
extracted_contents[report_type] = file_extractor(filepath).extract()
|
||||
|
||||
output_row_data = output_template.copy()
|
||||
|
||||
# dict_keys([ 'City/County', 'District/Town',
|
||||
|
|
@ -193,11 +196,9 @@ def handler():
|
|||
# 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone',
|
||||
# 'Owner - Email', 'Tenant - Name', 'Tenant - Phone',
|
||||
# 'Trustmark Licence Number',
|
||||
# 'Company Name', 'Retrofit Designer Name',
|
||||
# Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat
|
||||
# Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures
|
||||
# Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
|
||||
# Populate the output row data
|
||||
# Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness',
|
||||
# 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
|
||||
|
||||
update_dictionary_with_check(
|
||||
output_row_data,
|
||||
|
|
@ -297,6 +298,29 @@ def handler():
|
|||
{"Pre Air Tightness": ap50}
|
||||
)
|
||||
|
||||
if extracted_contents.get("elmhurst project handover"):
|
||||
handover_to_insert = {
|
||||
"Number of Eligible Measures Installed": len(
|
||||
extracted_contents["elmhurst project handover"]["Measures Fitted"]
|
||||
),
|
||||
"Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"],
|
||||
"Company Name": extracted_contents["elmhurst project handover"]["Installer Name"],
|
||||
"R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"],
|
||||
}
|
||||
update_dictionary_with_check(output_row_data, handover_to_insert)
|
||||
|
||||
if extracted_contents.get("core logic pas assessment report"):
|
||||
cr_to_insert = {
|
||||
"No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"],
|
||||
}
|
||||
update_dictionary_with_check(
|
||||
output_row_data,
|
||||
cr_to_insert
|
||||
)
|
||||
|
||||
extracted.append(output_row_data)
|
||||
|
||||
extracted_df = pd.DataFrame(extracted)
|
||||
|
||||
extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv",
|
||||
index=False)
|
||||
|
|
|
|||
|
|
@ -11,3 +11,4 @@ pymupdf
|
|||
pytesseract
|
||||
pdf2image
|
||||
pillow
|
||||
pdfplumber
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import PyPDF2
|
||||
import re
|
||||
import pdfplumber
|
||||
from collections import Counter
|
||||
from utils.logger import setup_logger
|
||||
from xml.dom.minidom import parseString
|
||||
|
|
@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text):
|
|||
return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text
|
||||
|
||||
|
||||
def is_core_logic_pas_assessment_report(text):
    """
    Determines if the provided text indicates that the PDF is a CoreLogic PAS Assessment Report.

    The report is recognised by the banner "Generated Using CoreLogic UK PAS Assessment"
    appearing at the very start of the text. Leading whitespace is ignored so that
    stray newlines or spaces produced by PDF text extraction do not hide the banner.

    Args:
        text (str | None): Text extracted from the PDF (e.g. its first page).

    Returns:
        bool: True if the text begins with the CoreLogic banner, False otherwise
        (including when the text is empty or None).
    """
    # Text extraction can yield nothing for image-only pages; treat that as "not a match"
    # instead of raising AttributeError on None.
    if not text:
        return False
    return text.lstrip().startswith("Generated Using CoreLogic UK PAS Assessment")
|
||||
|
||||
|
||||
def detect_pdf_report_type(pdf_path):
|
||||
"""
|
||||
Detects the type of report based on content or filename.
|
||||
|
|
@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path):
|
|||
return "pulse air permeability"
|
||||
elif is_elmhurst_project_handover(first_page_text):
|
||||
return "elmhurst project handover"
|
||||
elif is_core_logic_pas_assessment_report(first_page_text):
|
||||
return "core logic pas assessment report"
|
||||
|
||||
return None
|
||||
|
||||
|
|
@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor:
|
|||
self.file_path = file_path
|
||||
|
||||
def extract(self):
    """
    Extract key fields from an Elmhurst Retrofit Project Handover PDF.

    The text of every page of ``self.file_path`` is read with PyPDF2 and each
    field is located with a regular expression.

    Returns:
        dict: Maps "Retrofit Coordinator Name", "Retrofit Coordinator ID",
        "Designer Name" and "Installer Name" to stripped strings, and
        "Measures Fitted" to a list of stripped strings (one per bullet found
        between "Measure(s) Fitted:" and "Retrofit Assessor Name:").

    Raises:
        ValueError: If any of the fields cannot be found in the document text.
    """
    with open(self.file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the text in one join rather than repeated +=; "or ''" keeps a page
        # that yields no text from breaking the concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    # Regex patterns, one per output field
    patterns = {
        "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)",
        "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)",
        "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:",
        "Designer Name": r"Designer Name\(s\):\s*(.+)",
        "Installer Name": r"Installer Name\(s\):\s*(.+)",
    }

    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if not match:
            # Every field is mandatory: fail loudly rather than emit a partial row.
            raise ValueError(f"Could not match {key}")

        if key == "Measures Fitted":
            # The measures are a multi-line bulleted list; the character class covers
            # the bullet glyphs that can come out of PDF text extraction.
            bullets = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1))
            data[key] = [bullet.strip() for bullet in bullets]
        else:
            data[key] = match.group(1).strip()

    return data
|
||||
|
||||
|
||||
class CoreLogicPasAssessmentReportExtractor:
    """
    A utility class for extracting specific data from CoreLogic PAS Assessment Reports.

    Currently only the number of bedrooms is extracted, by scanning the tables
    that pdfplumber detects on each page.
    """

    def __init__(self, file_path):
        """
        Args:
            file_path: Path of the PAS Assessment Report PDF to read.
        """
        self.file_path = file_path

    @staticmethod
    def _bedrooms_in_tables(tables):
        """
        Return the bedroom count from the first usable "Number of bedrooms" row.

        Args:
            tables: Iterable of tables, each a list of rows, each row a list of
                cells (strings or None). May be empty or None.

        Returns:
            int | None: The first purely numeric cell of the first row that mentions
            "Number of bedrooms", or None when no such row/value exists.
        """
        for table in tables or []:
            for row in table:
                if not any("Number of bedrooms" in str(cell) for cell in row):
                    continue
                for cell in row:
                    # Skip None/empty cells and the label cell itself; only a cell that
                    # is entirely digits is taken as the value.
                    if isinstance(cell, str) and cell.strip().isdigit():
                        return int(cell.strip())
        return None

    def extract(self):
        """
        Extract data from the report.

        Returns:
            dict: ``{"Number of bedrooms": int}`` when the value is found,
            otherwise an empty dict.
        """
        data = {}

        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                bedrooms = self._bedrooms_in_tables(page.extract_tables())
                if bedrooms is not None:
                    data["Number of bedrooms"] = bedrooms
                    # Stop at the first hit: no need to parse the remaining pages,
                    # and later tables must not overwrite the value.
                    break

        return data
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue