extracting epr

2026-07-27 23:35:01 +00:00 · 2024-10-28 12:04:57 +00:00 · 2024-10-28 12:04:57 +00:00 · cf2a94cb36
commit cf2a94cb36
parent 0332c77098
1 changed files with 84 additions and 10 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -43,6 +43,65 @@ def extract_summary_report(pdf_path):
    return data


+def extract_epr(pdf_path):
+    """
+    Extracts specific data from an Energy Report (EPR) PDF file.
+    """
+    data = {
+        "Address": None,
+        "Estimated Annual Costs": None,
+        "Current SAP": None,
+        "Space Heating": None,
+        "Water Heating": None,
+        "Fuel Bill": None,
+    }
+
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+
+        # Extract Address
+        address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
+        data["Address"] = address_match.group(1).strip()
+
+        # Extract Total Floor Area
+        area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
+        data["Total Floor Area"] = area_match.group(1)
+
+        # Extract Estimated Annual Costs
+        cost_match = re.search(r"TOTAL\s*£(\d+)", text)
+        data["Estimated Annual Costs"] = f"£{cost_match.group(1)}"
+
+        # Extract Current SAP rating
+        # Updated Regular Expression to find "GG (1-20)" followed by two numbers
+        sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
+
+        # Extract and validate the Current and Potential SAP ratings
+        current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
+        # Ensure potential is greater than or equal to current
+        if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
+            data["Current SAP"] = current_sap
+            data["Potential SAP"] = potential_sap
+        else:
+            raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
+
+        # Extract Space Heating (kWh)
+        space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text)
+        data["Space Heating"] = int(space_heating_match.group(1))
+
+        # Extract Water Heating (kWh)
+        water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text)
+        data["Water Heating"] = int(water_heating_match.group(1))
+
+        # Extract Fuel Bill (total estimated costs)
+        fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+    return data
+
+
 def extract_retrofit_assessment_folder(retrofit_folder_path):
    """
    Handles extraction from a retrofit assessment folder if it exists and has content.
@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
    return None  # If no relevant PDF is found


+def is_energy_report(text):
+    """
+    Determines if the provided text indicates that the PDF is an Energy Report.
+    Returns True only if the text starts with the exact string 'ENERGY REPORT';
+    the check is case-sensitive and does not look at the rest of the text.
+    """
+    return text.startswith("ENERGY REPORT")
+
+
 def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
    """
-    survey_files = os.listdir(survey_folder_path)
+    # Only names ending in lower-case ".pdf" are kept (the suffix test is case-sensitive).
+    survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")]

-    # Look for a summary report directly in the survey folder
-    summary_report = next(
-        (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None
-    )
+    # Every branch in the loop body returns or raises, so only the first PDF that
+    # os.listdir yields is ever examined.
+    for pdf_file in survey_files:
+        pdf_path = os.path.join(survey_folder_path, pdf_file)

-    if summary_report:
-        pdf_path = os.path.join(survey_folder_path, summary_report)
-        return extract_summary_report(pdf_path)
+        # Attempt to read the first page of the PDF to determine type
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            first_page_text = reader.pages[0].extract_text() if reader.pages else ""

-    return None  # If no relevant PDF is found
+            if is_energy_report(first_page_text):
+                # Treat this as an Energy Report
+                # extract_epr opens pdf_path again itself; the handle above is still open here.
+                return extract_epr(pdf_path)
+            elif "summary" in pdf_file.lower():
+                # Treat this as a Summary Report
+                return extract_summary_report(pdf_path)
+            else:
+                # NOTE(review): a PDF that is neither an energy report nor has "summary" in
+                # its name aborts the whole call instead of being skipped - confirm intended.
+                raise NotImplementedError("Implement me")
+
+    # Reached only when the folder contains no ".pdf" files.
+    return None


 def main():
@ -109,7 +184,6 @@ def main():
                    }
                    extracted_data.append(summary_data)
                    continue
-
        # If no retrofit folder or it was empty, check files in survey_folder
        summary_data = extract_from_survey_folder_files(survey_folder_path)
        if summary_data: