From c6e02836a88cd2a4af7dc8a6ee10e160d6e60f68 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 29 Nov 2024 12:10:29 +0000
Subject: [PATCH] poc done for now

---
 etl/lodgement/app.py           | 34 +++++++++++++---
 etl/lodgement/requirements.txt |  1 +
 utils/file_data_extraction.py  | 72 +++++++++++++++++++++++++++++++++-
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py
index 2bdeb3d7..c1da35dd 100644
--- a/etl/lodgement/app.py
+++ b/etl/lodgement/app.py
@@ -29,6 +29,7 @@ output_template = {
     "Tenant - Name": None,
     "Tenant - Phone": None,
     "R. Assessor - Name": None,
+    "R. Coordinator - Name": None,
     "Trustmark Licence Number": None,
     "Retrofit Assessment Date": None,
     "Company Name": None,
@@ -100,6 +101,7 @@ def handler():
         "full sap xml": FullSapParser,
         "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
         "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
+        "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
     }
 
     extracted = []
@@ -183,9 +185,10 @@ def handler():
                 report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
                 if report_type != "elmhurst project handover":
                     continue
-                blah
                 file_extractor = extractors[report_type]
 
+                extracted_contents[report_type] = file_extractor(filepath).extract()
+
         output_row_data = output_template.copy()
 
         # dict_keys([ 'City/County', 'District/Town',
@@ -193,11 +196,9 @@ def handler():
         # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone',
         # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone',
         # 'Trustmark Licence Number',
-        # 'Company Name', 'Retrofit Designer Name',
         # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat
-        # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures
-        # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
-        # Populate the output row data
+        # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness',
+        # 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
 
         update_dictionary_with_check(
             output_row_data,
@@ -297,6 +298,29 @@ def handler():
                 {"Pre Air Tightness": ap50}
             )
 
+        if extracted_contents.get("elmhurst project handover"):
+            handover_to_insert = {
+                "Number of Eligible Measures Installed": len(
+                    extracted_contents["elmhurst project handover"]["Measures Fitted"]
+                ),
+                "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"],
+                "Company Name": extracted_contents["elmhurst project handover"]["Installer Name"],
+                "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"],
+            }
+            update_dictionary_with_check(output_row_data, handover_to_insert)
+
+        if extracted_contents.get("core logic pas assessment report"):
+            cr_to_insert = {
+                "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"],
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                cr_to_insert
+            )
+
         extracted.append(output_row_data)
 
     extracted_df = pd.DataFrame(extracted)
+
+    extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv",
+                        index=False)
diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt
index 09e475fe..412aed3b 100644
--- a/etl/lodgement/requirements.txt
+++ b/etl/lodgement/requirements.txt
@@ -11,3 +11,4 @@ pymupdf
 pytesseract
 pdf2image
 pillow
+pdfplumber
diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py
index ef02e7f0..2e849ef5 100644
--- a/utils/file_data_extraction.py
+++ b/utils/file_data_extraction.py
@@ -1,5 +1,6 @@
 import PyPDF2
 import re
+import pdfplumber
 from collections import Counter
 from utils.logger import setup_logger
 from xml.dom.minidom import parseString
@@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text):
     return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text
 
 
+def is_core_logic_pas_assessment_report(text):
+    """Report whether the text opens with the CoreLogic PAS Assessment banner."""
+    # The literal being matched contains a double space after "UK".
+    banner = "Generated Using CoreLogic UK  PAS Assessment"
+    return text.startswith(banner)
+
+
 def detect_pdf_report_type(pdf_path):
     """
     Detects the type of report based on content or filename.
@@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path):
         return "pulse air permeability"
     elif is_elmhurst_project_handover(first_page_text):
         return "elmhurst project handover"
+    elif is_core_logic_pas_assessment_report(first_page_text):
+        return "core logic pas assessment report"
 
     return None
 
@@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor:
         self.file_path = file_path
 
     def extract(self):
-        pass
+        """
+        Pull coordinator, designer, installer and measure details from the PDF.
+
+        Returns a dict keyed by field name; "Measures Fitted" is a list of the
+        bulleted measure strings, every other value is a stripped string.
+        Raises ValueError when any expected field is absent from the text.
+        """
+        with open(self.file_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            # Defensive `or ""`: tolerate a page that yields no text at all.
+            text = "".join(page.extract_text() or "" for page in reader.pages)
+
+        patterns = {
+            "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)",
+            "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)",
+            "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:",
+            "Designer Name": r"Designer Name\(s\):\s*(.+)",
+            "Installer Name": r"Installer Name\(s\):\s*(.+)",
+        }
+
+        data = {}
+        for key, pattern in patterns.items():
+            match = re.search(pattern, text)
+            if not match:
+                raise ValueError(f"Could not match {key}")
+            if key == "Measures Fitted":
+                # The captured block is multiline: one measure per bullet glyph.
+                bullets = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1))
+                data[key] = [m.strip() for m in bullets]
+            else:
+                data[key] = match.group(1).strip()
+
+        return data
+
+
+class CoreLogicPasAssessmentReportExtractor:
+    """
+    A utility class for extracting specific data from CoreLogic PAS Assessment Reports.
+    """
+
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def extract(self):
+        """
+        Scan every table in the PDF for the "Number of bedrooms" row.
+
+        Returns {"Number of bedrooms": int} for the first labelled row that has a
+        purely numeric cell, or an empty dict when no such row exists.
+        """
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                for table in page.extract_tables():
+                    for row in table:
+                        if not any("Number of bedrooms" in str(cell) for cell in row):
+                            continue
+                        # Cells can be None, so check truthiness before stripping.
+                        for cell in row:
+                            if cell and cell.strip().isdigit():
+                                return {"Number of bedrooms": int(cell.strip())}
+        return {}