From c6e02836a88cd2a4af7dc8a6ee10e160d6e60f68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Nov 2024 12:10:29 +0000 Subject: [PATCH] poc done for now --- etl/lodgement/app.py | 34 +++++++++++++--- etl/lodgement/requirements.txt | 1 + utils/file_data_extraction.py | 72 +++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 2bdeb3d7..c1da35dd 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -29,6 +29,7 @@ output_template = { "Tenant - Name": None, "Tenant - Phone": None, "R. Assessor - Name": None, + "R. Coordinator - Name": None, "Trustmark Licence Number": None, "Retrofit Assessment Date": None, "Company Name": None, @@ -100,6 +101,7 @@ def handler(): "full sap xml": FullSapParser, "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, + "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor, } extracted = [] @@ -183,9 +185,10 @@ def handler(): report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) if report_type != "elmhurst project handover": continue - blah file_extractor = extractors[report_type] + extracted_contents[report_type] = file_extractor(filepath).extract() + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', @@ -193,11 +196,9 @@ def handler(): # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone', # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone', # 'Trustmark Licence Number', - # 'Company Name', 'Retrofit Designer Name', # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat - # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures - # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) - # Populate the output row data + # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', + # 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) update_dictionary_with_check( output_row_data, @@ -297,6 +298,29 @@ def handler(): {"Pre Air Tightness": ap50} ) + if extracted_contents.get("elmhurst project handover"): + handover_to_insert = { + "Number of Eligible Measures Installed": len( + extracted_contents["elmhurst project handover"]["Measures Fitted"] + ), + "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"], + "Company Name": extracted_contents["elmhurst project handover"]["Installer Name"], + "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"], + } + update_dictionary_with_check(output_row_data, handover_to_insert) + + if extracted_contents.get("core logic pas assessment report"): + cr_to_insert = { + "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"], + } + update_dictionary_with_check( + output_row_data, + cr_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) + + extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv", + index=False) diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 09e475fe..412aed3b 100644 --- a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -11,3 +11,4 @@ pymupdf pytesseract pdf2image pillow +pdfplumber diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index ef02e7f0..2e849ef5 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -1,5 +1,6 @@ import PyPDF2 import re +import pdfplumber from collections import Counter from utils.logger import setup_logger from xml.dom.minidom import parseString @@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text): return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text +def is_core_logic_pas_assessment_report(text): + """ + Determines if the provided text indicates that the PDF is a PAS Assessment Report. + """ + return text.startswith("Generated Using CoreLogic UK PAS Assessment") + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. @@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path): return "pulse air permeability" elif is_elmhurst_project_handover(first_page_text): return "elmhurst project handover" + elif is_core_logic_pas_assessment_report(first_page_text): + return "core logic pas assessment report" return None @@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor: self.file_path = file_path def extract(self): - pass + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + data = {} + + # Regex patterns + patterns = { + "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)", + "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)", + "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:", + "Designer Name": r"Designer Name\(s\):\s*(.+)", + "Installer Name": r"Installer Name\(s\):\s*(.+)", + } + + # Extract data + for key, pattern in patterns.items(): + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not match {key}") + if match: + if key == "Measures Fitted": + # Special handling for multiline measures + measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1)) + measures = [m.strip() for m in measures] + data[key] = measures + else: + data[key] = match.group(1).strip() if match else "" + + return data + + +class CoreLogicPasAssessmentReportExtractor: + """ + A utility class for extracting specific data from CoreLogic PAS Assessment Reports. + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + data = {} + + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + tables = page.extract_tables() + if tables: # If tables are detected on the page + for table in tables: + for row in table: + # Check if the row contains "Number of bedrooms" + if any("Number of bedrooms" in str(cell) for cell in row): + # Extract the corresponding value by filtering out None and non-relevant cells + for cell in row: + if cell and cell.strip().isdigit(): # Check if cell contains a numeric value + data["Number of bedrooms"] = int(cell.strip()) + break # Stop further processing once value is found + + return data