poc done for now

2026-07-27 23:35:01 +00:00 · 2024-11-29 12:10:29 +00:00 · 2024-11-29 12:10:29 +00:00 · c6e02836a8
commit c6e02836a8
parent 3cd9670d1a
3 changed files with 101 additions and 6 deletions
--- a/etl/lodgement/app.py
+++ b/etl/lodgement/app.py
@ -29,6 +29,7 @@ output_template = {
    "Tenant - Name": None,
    "Tenant - Phone": None,
    "R. Assessor - Name": None,
+    "R. Coordinator - Name": None,
    "Trustmark Licence Number": None,
    "Retrofit Assessment Date": None,
    "Company Name": None,
@ -100,6 +101,7 @@ def handler():
        "full sap xml": FullSapParser,
        "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
        "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
+        "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
    }

    extracted = []
@ -183,9 +185,10 @@ def handler():
                report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
                if report_type != "elmhurst project handover":
                    continue
-                blah
                file_extractor = extractors[report_type]

+                extracted_contents[report_type] = file_extractor(filepath).extract()
+
        output_row_data = output_template.copy()

        # dict_keys([ 'City/County', 'District/Town',
@ -193,11 +196,9 @@ def handler():
        # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone',
        # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone',
        # 'Trustmark Licence Number',
-        # 'Company Name', 'Retrofit Designer Name',
        # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat
-        # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures
-        # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
-        # Populate the output row data
+        # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness',
+        # 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])

        update_dictionary_with_check(
            output_row_data,
@ -297,6 +298,29 @@ def handler():
                {"Pre Air Tightness": ap50}
            )

+        if extracted_contents.get("elmhurst project handover"):
+            handover_to_insert = {
+                "Number of Eligible Measures Installed": len(
+                    extracted_contents["elmhurst project handover"]["Measures Fitted"]
+                ),
+                "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"],
+                "Company Name": extracted_contents["elmhurst project handover"]["Installer Name"],
+                "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"],
+            }
+            update_dictionary_with_check(output_row_data, handover_to_insert)
+
+        if extracted_contents.get("core logic pas assessment report"):
+            cr_to_insert = {
+                "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"],
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                cr_to_insert
+            )
+
        extracted.append(output_row_data)

    extracted_df = pd.DataFrame(extracted)
+
+    extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv",
+                        index=False)
--- a/etl/lodgement/requirements.txt
+++ b/etl/lodgement/requirements.txt
@ -11,3 +11,4 @@ pymupdf
 pytesseract
 pdf2image
 pillow
+pdfplumber
--- a/utils/file_data_extraction.py
+++ b/utils/file_data_extraction.py
@ -1,5 +1,6 @@
 import PyPDF2
 import re
+import pdfplumber
 from collections import Counter
 from utils.logger import setup_logger
 from xml.dom.minidom import parseString
@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text):
    return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text


+def is_core_logic_pas_assessment_report(text):
+    """
+    Reports whether the provided text begins with the CoreLogic PAS Assessment banner.
+
+    Returns True only when the text starts with the banner exactly; otherwise False.
+    """
+    # The literal contains two spaces after "UK"; the comparison is exact.
+    banner = "Generated Using CoreLogic UK  PAS Assessment"
+    return text.startswith(banner)
+
+
 def detect_pdf_report_type(pdf_path):
    """
    Detects the type of report based on content or filename.
@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path):
        return "pulse air permeability"
    elif is_elmhurst_project_handover(first_page_text):
        return "elmhurst project handover"
+    elif is_core_logic_pas_assessment_report(first_page_text):
+        return "core logic pas assessment report"

    return None

@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor:
        self.file_path = file_path

    def extract(self):
-        pass
+        """
+        Read the PDF at ``self.file_path`` and extract the project handover fields.
+
+        Returns a dict with the keys "Retrofit Coordinator Name",
+        "Retrofit Coordinator ID", "Designer Name" and "Installer Name"
+        (stripped strings) plus "Measures Fitted" (a list of stripped strings,
+        one per bullet item found in the captured section).
+
+        Raises ValueError if any of the field patterns cannot be matched in the
+        extracted text.
+        """
+        with open(self.file_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            # Defensive: tolerate a page that yields no text instead of failing
+            # on the concatenation.
+            text = "".join(page.extract_text() or "" for page in reader.pages)
+
+        # One regex per output key; group 1 always captures the value.
+        patterns = {
+            "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)",
+            "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)",
+            "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:",
+            "Designer Name": r"Designer Name\(s\):\s*(.+)",
+            "Installer Name": r"Installer Name\(s\):\s*(.+)",
+        }
+
+        data = {}
+        for key, pattern in patterns.items():
+            match = re.search(pattern, text)
+            if not match:
+                raise ValueError(f"Could not match {key}")
+
+            captured = match.group(1)
+            if key == "Measures Fitted":
+                # The capture spans several lines; each measure is a bullet item.
+                bullets = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", captured)
+                data[key] = [bullet.strip() for bullet in bullets]
+            else:
+                data[key] = captured.strip()
+
+        return data
+
+
+class CoreLogicPasAssessmentReportExtractor:
+    """
+    A utility class for extracting specific data from CoreLogic PAS Assessment Reports.
+    """
+
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def extract(self):
+        """
+        Scan the tables of the PDF at ``self.file_path`` for the bedroom count.
+
+        Returns ``{"Number of bedrooms": <int>}`` taken from the first table row
+        that mentions "Number of bedrooms" and also holds an all-digit cell, or
+        an empty dict when no such row exists.
+        """
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                for table in page.extract_tables():
+                    for row in table:
+                        # Only rows labelled "Number of bedrooms" are of interest.
+                        if not any("Number of bedrooms" in str(cell) for cell in row):
+                            continue
+                        for cell in row:
+                            # Skip empty/None cells and the label itself; take the numeric cell.
+                            if cell and cell.strip().isdigit():
+                                # Return immediately so later rows, tables or pages
+                                # cannot overwrite the value that was found first.
+                                return {"Number of bedrooms": int(cell.strip())}
+
+        return {}