From a9ea89d2ae5253453e227c83c067f8a248d3f893 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 31 Oct 2024 12:03:17 +0000
Subject: [PATCH] done with stonewater for now

---
 .../stonewater/Wave 3 Preparation.py          | 144 ++++++++++++++++--
 1 file changed, 133 insertions(+), 11 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index bfdc8beb..477a73c8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -76,10 +76,13 @@ def extract_summary_report(pdf_path):
         'First Extension Wall Area (m2)': None,
         "Number of Light Fittings": None,
         "Number of LEL Fittings": None,
-        "Number of fittings needing LEL": None
+        "Number of fittings needing LEL": None,
+        "Main Roof Type": None,
+        "Main Roof Insulation": None,
+        "Main Roof Insulation Thickness": None,
     }
 
-    with open(pdf_path, "rb") as file:
+    with (open(pdf_path, "rb") as file):
         reader = PyPDF2.PdfReader(file)
         text = ""
         for page in reader.pages:
@@ -205,6 +208,27 @@ def extract_summary_report(pdf_path):
         data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
         data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
 
+        roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL)
+        roof_text = roof_section.group(1).strip()
+        roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text)
+        data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None
+
+        # Check if "Insulation" exists between Type and Insulation Thickness
+        insulation_search = re.search(
+            r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL
+        )
+
+        if insulation_search:
+            # Insulation match will be present if it exists, otherwise it will be None
+            insulation_match = insulation_search.group(2)  # Optional group for Insulation
+            insulation_thickness_match = insulation_search.group(4)  # Required group for Insulation Thickness
+
+            # Populate insulation fields
+            data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None
+            data["Main Roof Insulation Thickness"] = (
+                insulation_thickness_match.strip() if insulation_thickness_match else None
+            )
+
     return data
 
 
@@ -434,6 +458,49 @@ def extract_building_parts_summary(text):
     return dimensions
 
 
+import re
+
+
+def extract_roof_details_epr(text):
+    """
+    Returns one dict per "Construction details: Building part: ..." section found in ``text``, with keys
+    "Building Part", "Roof Type", "Roof Insulation", "Roof Insulation Thickness" (roof fields None if unmatched).
+    """
+    # One entry is appended per matched building part, in the order the sections appear in the text
+    roof_data = []
+
+    # group(1) = rest of heading line; group(2) = text up to next "Conservatory"/"Construction details" or end
+    building_part_pattern = re.compile(
+        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
+        re.DOTALL
+    )
+
+    # Pull the roof fields out of each building part section
+    for match in building_part_pattern.finditer(text):
+        part_name = match.group(1).strip()
+
+        # Drop a trailing " - built in..." or "Room(s) in Roof area:..." suffix from the heading text
+        cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
+
+        part_details = match.group(2)
+
+        # Value = rest of the label's line. NOTE(review): \s* also spans "\n", so a blank value grabs the next line
+        roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details)
+        roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details)
+        roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details)
+
+        # Fields whose label was not found are stored as None rather than raising
+        roof_data.append({
+            "Building Part": cleaned_part_name,
+            "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None,
+            "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None,
+            "Roof Insulation Thickness": roof_insulation_thickness_match.group(
+                1).strip() if roof_insulation_thickness_match else None,
+        })
+
+    return roof_data
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -471,7 +538,10 @@ def extract_epr(pdf_path):
         'First Extension Wall Area (m2)': None,
         "Number of Light Fittings": None,
         "Number of LEL Fittings": None,
-        "Number of fittings needing LEL": None
+        "Number of fittings needing LEL": None,
+        "Main Roof Type": None,
+        "Main Roof Insulation": None,
+        "Main Roof Insulation Thickness": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -590,6 +660,13 @@ def extract_epr(pdf_path):
         data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
         data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
 
+        roof_details = extract_roof_details_epr(text)
+        # Get from the main building
+        main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]]
+        data["Main Roof Type"] = main_roof_details[0]["Roof Type"]
+        data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"]
+        data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"]
+
     return data
 
 
@@ -1077,13 +1154,11 @@ def main():
     # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater
     cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False)
 
-    stonewater_data["Room in Roof"].value_counts()
-
     # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values
 
     create_proposed_wave_3_bid(
         costed_packages_filepath=os.path.join(
-            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx"
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx"
         ),
         archetypes_sheet_filepath=os.path.join(
             CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
@@ -1098,11 +1173,30 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
     archetypes_to_cost = costed_packages[
         [
             "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band",
-            "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost',
-            'Total Cost of Measures inc Contingency'
+            "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost',
+            'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation',
+            'Main Roof Insulation Thickness', 'Existing Primary Heating System',
+            'Existing Primary Heating PCDF Reference'
         ]
     ].copy()
 
+    # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons!
+    archetypes_to_cost['Surveyed Main Roof'] = (
+        archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' +
+        archetypes_to_cost['Main Roof Insulation Thickness'].astype(str)
+    )
+
+    # Combine the heating systems, separating by colons!
+    archetypes_to_cost['Surveyed Main Heating'] = (
+        archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[
+        'Existing Primary Heating PCDF Reference'].astype(str)
+    )
+
+    archetypes_to_cost = archetypes_to_cost.drop(
+        columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness',
+                 'Existing Primary Heating System',
+                 'Existing Primary Heating PCDF Reference'])
+
     # We take properties that are EPC D and below (61% of units)
     archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])]
 
@@ -1139,7 +1233,19 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
 
     match_classification = []
     for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)):
-        surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]]
+
+        surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy()
+        surveyed["Package Ref"] = surveyed["Package Ref"].astype(str)
+
+        package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()]))
+        package = package.replace("\n", "")
+
+        surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()]))
+        surveyed_roofs = surveyed_roofs.replace("\n", "")
+
+        surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()]))
+        surveyed_heating = surveyed_heating.replace("\n", "")
+
         # We now check if we have a perfect match
         surveyed = surveyed[
             (surveyed["Property Type"] == home["Property Type"]) &
@@ -1149,17 +1255,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
             ]
 
         if surveyed.empty:
+            if package == "2B2A":
+                raise Exception("Fix me")
             match_classification.append(
                 {
                     "Address ID": home["Address ID"],
-                    "Match to Surveyed": "Approximate"
+                    "Match to Surveyed": "Approximate",
+                    "Proposed Package Ref": package,
+                    "Surveyed Archetype Roofs": surveyed_roofs,
+                    "Surveyed Archetype Heating": surveyed_heating
                 }
             )
             continue
+        # Recompute the labels from the exact-match rows only (the earlier values covered the whole archetype)
+        package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()]))
+        package = package.replace("\n", "")
+        surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()]))
+        surveyed_roofs = surveyed_roofs.replace("\n", "")
+        surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()]))
+        surveyed_heating = surveyed_heating.replace("\n", "")
+
         match_classification.append(
             {
                 "Address ID": home["Address ID"],
-                "Match to Surveyed": "Exact"
+                "Match to Surveyed": "Exact",
+                "Proposed Package Ref": package,
+                "Surveyed Archetype Roofs": surveyed_roofs,
+                "Surveyed Archetype Heating": surveyed_heating
             }
         )