From 362e657ab5f4710cf6bd472ccd14f65c9fa354e3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 11:45:57 +0100
Subject: [PATCH 01/59] handling different format of surveyed windows

---
 etl/customers/aiha/xml_extraction.py   | 60 ++++++++++++++++++++++++++
 etl/xml_survey_extraction/XmlParser.py | 34 ++++++++++++++-
 2 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 etl/customers/aiha/xml_extraction.py

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
new file mode 100644
index 00000000..d235be78
--- /dev/null
+++ b/etl/customers/aiha/xml_extraction.py
@@ -0,0 +1,60 @@
+import os
+from io import BytesIO
+from etl.xml_survey_extraction.XmlParser import XmlParser
+
+SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
+
+
+def main():
+    """
+    This script handles the extraction of data from the XML files in the survey folders.
+    :return:
+    """
+    # Step 1: List all subfolders inside SURVEY_FOLDER_PATH.
+    subfolders = [f.path for f in os.scandir(SURVEY_FOLDER_PATH) if f.is_dir()]
+
+    # Step 2: Loop through each subfolder and find the XML files.
+    extracted_surveys = []
+    for subfolder in subfolders:
+        print(f"Searching in subfolder: {subfolder}")
+
+        # Find all XML files in the current subfolder.
+        xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')]
+
+        if not xml_files:
+            raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}")
+
+        # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key.
+        for xml_file in xml_files:
+            xml_path = os.path.join(subfolder, xml_file)
+            print(f"Processing XML file: {xml_path}")
+
+            # Read in the XML and parse it using the XmlParser class.
+            with open(xml_path, 'rb') as file:
+                xml_data_io = BytesIO(file.read())
+            uprn = None  # Set the UPRN if available.
+
+            # Create an XmlParser instance
+            xml_parser = XmlParser(
+                file=xml_data_io,
+                filekey=xml_path,
+                surveyor_company="",
+                uprn=uprn,
+            )
+
+            # Run the parser to extract the data
+            xml_parser.run()
+
+            # Store the extracted data for further processing
+            extracted_surveys.append({
+                "epc": xml_parser.epc,
+                "additional_data": xml_parser.additional_data,
+                "subfolder": subfolder
+            })
+
+    print(f"Extracted {len(extracted_surveys)} surveys.")
+    # Process the extracted_surveys as needed, for example, save to a database or write to a file.
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index ffe191a4..ed3d65d2 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -769,8 +769,6 @@ class XmlParser:
         :return:
         """
 
-        sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window")
-
         glazing_type_lookup = {
             "3": "double glazing, unknown install date",
             "5": "Single glazing",
@@ -787,6 +785,38 @@ class XmlParser:
             "8": "North West"
         }
 
+        sap_windows = self.xml.getElementsByTagName("SAP-Windows")
+
+        if not sap_windows:
+            # We look for Multi-Glazed-Proportion
+            multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "Multiple-Glazing-Type"
+            )[0].firstChild.nodeValue
+
+            pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "PVC-Window-Frames"
+            )[0].firstChild.nodeValue
+
+            multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "Multiple-Glazed-Proportion"
+            )[0].firstChild.nodeValue
+
+            self.windows = [
+                {
+                    "window_location": None,
+                    "window_area": None,
+                    "window_type": None,
+                    "glazing_type": glazing_type_lookup[multiple_glazing_type],
+                    "pvc_frame": pvc_frame,
+                    "glazing_gap": None,
+                    "orientation": None,
+                    "multple_glazed_proportion": multple_glazed_proportion
+                }
+            ]
+            return
+
+        sap_windows = sap_windows[0].getElementsByTagName("SAP-Window")
+
         self.windows = [
             self._parse_windows_content(
                 window=window,

From 323364e0dff03fe5a02c575cce043568eae783e4 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 11:51:00 +0100
Subject: [PATCH 02/59] added additional built form to built form map in
 XmlParser

---
 etl/xml_survey_extraction/XmlParser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index ed3d65d2..a0ed02e1 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -107,6 +107,7 @@ class XmlParser:
 
     BUILT_FORM_MAP = {
         "1": "Detached",
+        "2": "Semi-Detached",
         "3": "End-Terrace",
         "4": "Mid-Terrace",
     }
@@ -803,7 +804,7 @@ class XmlParser:
 
             self.windows = [
                 {
-                    "window_location": None,
+                    "window_location": "0",
                     "window_area": None,
                     "window_type": None,
                     "glazing_type": glazing_type_lookup[multiple_glazing_type],

From 8f8e85c1e1d1fa202f5ec5c4747a92fcde36b292 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 12:01:05 +0100
Subject: [PATCH 03/59] debugging xml extraction

---
 etl/xml_survey_extraction/XmlParser.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index a0ed02e1..a4061b3a 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -113,6 +113,7 @@ class XmlParser:
     }
 
     GLAZED_AREA_MAP = {
+        "2": "More than Typical",
         "4": "Much More Than Typical"
     }
 
@@ -121,7 +122,8 @@ class XmlParser:
     }
 
     TRANSACTION_TYPE_MAP = {
-        "13": "ECO assessment"
+        "13": "ECO assessment",
+        "14": "Stock condition survey",
     }
 
     TENURE_MAP = {
@@ -401,8 +403,13 @@ class XmlParser:
         ]
 
         wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors])
-        window_areas = sum([float(w["window_area"]) for w in main_dwelling_windows])
-        return wall_areas - window_areas
+        window_areas = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None]
+        if not window_areas:
+            # We discount 10% of the wall area
+            insulation_wall_area = wall_areas * 0.9
+        else:
+            insulation_wall_area = wall_areas - window_areas
+        return insulation_wall_area
 
     def extract_additional_data(self):
 
@@ -416,7 +423,8 @@ class XmlParser:
         main_dwelling_windows = [w for w in self.windows if w["window_location"] == "0"]
 
         number_of_windows = len(main_dwelling_windows)
-        windows_area = sum([float(w["window_area"]) for w in main_dwelling_windows])
+        windows_area = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None]
+        windows_area = sum(windows_area) if windows_area else None
 
         boolean_lookup = {
             "true": True,
@@ -462,7 +470,7 @@ class XmlParser:
             "cylinder_thermostat": cylinder_thermostat,
             "main_dwelling_ground_floor_area": float(main_dwelling_ground_floor_area),
             "number_of_windows": int(number_of_windows),
-            "windows_area": float(windows_area),
+            "windows_area": float(windows_area) if windows_area is not None else windows_area,
         }
 
     def get_node_value(self, tag_name):

From 60490cd4faf100fe3f66754a23effc8211b1793c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 14:23:20 +0100
Subject: [PATCH 04/59] xml extraction

---
 etl/xml_survey_extraction/XmlParser.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index a4061b3a..a2246629 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -804,7 +804,9 @@ class XmlParser:
 
             pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
                 "PVC-Window-Frames"
-            )[0].firstChild.nodeValue
+            )
+
+            pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None
 
             multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
                 "Multiple-Glazed-Proportion"

From 9d4a93ca3efa43a66c5d3f13843f4f62386e978c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 15:18:42 +0100
Subject: [PATCH 05/59] debugging xml extraction

---
 etl/xml_survey_extraction/XmlParser.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index a2246629..f8f2285d 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -9,7 +9,8 @@ from etl.xml_survey_extraction.pcdb import heating_data
 PROPERTY_TYPE_LOOKUP = {
     "0": "House",
     "House": "House",
-    "2": "Flat"
+    "2": "Flat",
+    "3": "Maisonette",
 }
 
 
@@ -122,6 +123,7 @@ class XmlParser:
     }
 
     TRANSACTION_TYPE_MAP = {
+        "5": "Rented (social)",
         "13": "ECO assessment",
         "14": "Stock condition survey",
     }
@@ -134,7 +136,8 @@ class XmlParser:
 
     TARIFF_MAP = {
         "1": "Dual",
-        "2": "Single"
+        "2": "Single",
+        "3": "Unknown"
     }
 
     def __init__(self, file, filekey, surveyor_company, uprn=None):
@@ -408,7 +411,7 @@ class XmlParser:
             # We discount 10% of the wall area
             insulation_wall_area = wall_areas * 0.9
         else:
-            insulation_wall_area = wall_areas - window_areas
+            insulation_wall_area = wall_areas - sum(window_areas)
         return insulation_wall_area
 
     def extract_additional_data(self):
@@ -779,6 +782,7 @@ class XmlParser:
         """
 
         glazing_type_lookup = {
+            "2": "double glazing installed during or after 2002",
             "3": "double glazing, unknown install date",
             "5": "Single glazing",
         }

From bfded2aaf985b65a5551c7f0f55706d54f36a5f7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 23 Oct 2024 15:25:11 +0100
Subject: [PATCH 06/59] expanding xml extraction

---
 etl/xml_survey_extraction/XmlParser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index f8f2285d..fa70b6b7 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -439,6 +439,7 @@ class XmlParser:
         cylinder_insulation_type = {
             None: "",
             "1": "Foam",
+            "2": "Jacket"
         }
 
         cylinder_insulation_thickness = int(
@@ -782,6 +783,7 @@ class XmlParser:
         """
 
         glazing_type_lookup = {
+            "ND": "Single glazing",
             "2": "double glazing installed during or after 2002",
             "3": "double glazing, unknown install date",
             "5": "Single glazing",

From ce9b3e5e2014fdeaba52ecf977618a5b16898a29 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Oct 2024 18:13:28 +0100
Subject: [PATCH 07/59] creating aiha output

---
 etl/customers/aiha/xml_extraction.py | 452 ++++++++++++++++++++++++++-
 1 file changed, 448 insertions(+), 4 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index d235be78..416065e7 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -1,5 +1,8 @@
 import os
 from io import BytesIO
+
+import pandas as pd
+
 from etl.xml_survey_extraction.XmlParser import XmlParser
 
 SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
@@ -22,7 +25,8 @@ def main():
         xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')]
 
         if not xml_files:
-            raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}")
+            print(f"No XML files found in subfolder: {subfolder}")
+            continue
 
         # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key.
         for xml_file in xml_files:
@@ -44,16 +48,456 @@ def main():
 
             # Run the parser to extract the data
             xml_parser.run()
+            if not xml_parser.epc:
+                # If we don't have a lig xml
+                continue
 
             # Store the extracted data for further processing
             extracted_surveys.append({
-                "epc": xml_parser.epc,
-                "additional_data": xml_parser.additional_data,
-                "subfolder": subfolder
+                "survey_key": subfolder.split("/")[-1],
+                **xml_parser.epc,
+                **xml_parser.additional_data
             })
 
     print(f"Extracted {len(extracted_surveys)} surveys.")
     # Process the extracted_surveys as needed, for example, save to a database or write to a file.
+    extracted_surveys = pd.DataFrame(extracted_surveys)
+
+    # THis is the data we need for the AIHA project
+    measures_data = extracted_surveys[
+        ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"]
+    ]
+    measures_data = measures_data.sort_values("survey_key", ascending=True)
+
+    # Note:
+    # The properties will still have "Very poor" ratings for their hot water
+
+    # TODO
+    #   - AIH001-03 has a basement and so we should discount this area from the ground floor
+    #   - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft
+    #   - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the
+    #     best option for this property due to it being extrememly large and the walls being uninsulated. It might not
+    #     be performant enough in the winter, when COP will be more like 1.5.
+    #   - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are
+    #     in the property? Does it make sense to have such a large solar PV system (5.6kWp)?
+    #   - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C
+    #   - Generally, should we consider insulated doors?
+    #   - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same
+    #     buulding
+    #   - AIH001-09 - The extension is 1900-1929 but has a cavity wall
+    #   - AIH001-09 - Is it not possible to install a loft hatch?
+    #   - AIH001-09 - Why is there assumed secondary heating?
+    #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
+    #   - AIH001-11 - The layout of this unit is confusing, is there roof access?
+    #   - AIH001-12 - Why was there not access to the cylinder?
+    #
+
+    recommended_measures = [
+        {
+            "survey_key": "AIH001-01",
+            "starting_sap": 69,
+            "recommended_measures": [],
+            "notes": "Is EPC C"
+        },
+        {
+            "survey_key": "AIH001-03",
+            "starting_sap": 43,
+            "recommended_measures": [
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 1,
+                    "ending_sap": 44,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with various configurations",
+                    "config": [
+                        {
+                            "size": "4kWp",
+                            "orientation": "East",
+                            "elavation": 30,
+                            "overshading": "Modest",
+                        },
+                        {
+                            "size": "1.6kWp",
+                            "orientation": "Horizontal",
+                            "elavation": "Horizontal",
+                            "overshading": "Modest",
+                        }
+                    ],
+                    "sap_points": 7,
+                    "ending_sap": 53
+                },
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm of loft insulation",
+                    "sap_points": 8,
+                    "ending_sap": 61
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 3,
+                    "ending_sap": 64
+                }
+            ],
+            "notes": "There was no access to the loft for this property and so a loft hatch would need to be "
+                     "installed..."
+        },
+        {
+            "survey_key": "AIH001-04",
+            "starting_sap": 48,
+            "recommended_measures": [
+                {
+                    "measure": "Flat Roof Insulation",
+                    "description": "100mm flat roof insulation",
+                    "sap_points": 4,
+                    "ending_sap": 52
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 3,
+                    "ending_sap": 55
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 4kW capacity, south-facing",
+                    "config": [
+                        {
+                            "size": "4kW",
+                            "orientation": "South",
+                            "elavation": 30,
+                            "overshading": "Modest",
+                        }
+                    ],
+                    "sap_points": 12,
+                    "ending_sap": 67
+                }
+            ],
+            "notes": ""
+        },
+        {
+            "survey_key": "AIH001-05",
+            "starting_sap": 54,
+            "recommended_measures": [
+                {
+                    "measure": "Flat Roof Insulation",
+                    "description": "100mm flat roof insulation",
+                    "sap_points": 5,
+                    "ending_sap": 59,
+                },
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 2,
+                    "ending_sap": 61,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 4kW capacity, horizontal orientation",
+                    "config": [
+                        {
+                            "size": "4kW",
+                            "orientation": "Horizontal",
+                            "elavation": 30,
+                            "overshading": "Modest",
+                        }
+                    ],
+                    "sap_points": 9,
+                    "ending_sap": 70
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 3,
+                    "ending_sap": 73
+                }
+            ],
+            "notes": ""
+        },
+        {
+            "survey_key": "AIH001-06",
+            "starting_sap": 62,
+            "recommended_measures": [
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 2,
+                    "ending_sap": 64,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 2kW capacity, south-facing",
+                    "config": [
+                        {
+                            "size": "2kW",
+                            "orientation": "South",
+                            "elavation": 30,
+                            "overshading": "Modest",
+                        }
+                    ],
+                    "sap_points": 6,
+                    "ending_sap": 70
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-07",
+            "starting_sap": 74,
+            "recommended_measures": [],
+            "notes": "Is EPC C"
+        },
+        {
+            "survey_key": "AIH001-08",
+            "starting_sap": 56,
+            "recommended_measures": [
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm of loft insulation",
+                    "sap_points": 2,
+                    "ending_sap": 58,
+                },
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 4,
+                    "ending_sap": 62,
+                },
+                {
+                    "measure": "Internal Wall Insulation",
+                    "description": "100mm internal wall insulation",
+                    "sap_points": 5,
+                    "ending_sap": 69,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "Ventilation improvement",
+                    "sap_points": 0,
+                    "ending_sap": 69,
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-09",
+            "starting_sap": 44,
+            "recommended_measures": [
+                {
+                    "measure": "Internal Wall Insulation",
+                    "description": "100mm internal wall insulation",
+                    "sap_points": 8,
+                    "ending_sap": 52,
+                },
+                {
+                    "measure": "Cavity Wall Insulation",
+                    "description": "Cavity wall insulation for extensions",
+                    "sap_points": 1,
+                    "ending_sap": 53,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "Ventilation improvement",
+                    "sap_points": 0,
+                    "ending_sap": 53,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 3,
+                    "ending_sap": 56,
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-11",
+            "starting_sap": 59,
+            "recommended_measures": [
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 4,
+                    "ending_sap": 63,
+                },
+                {
+                    "measure": "Internal Wall Insulation",
+                    "description": "100mm internal wall insulation",
+                    "sap_points": 5,
+                    "ending_sap": 68,
+                },
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 1,
+                    "ending_sap": 69,
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-12",
+            "starting_sap": 46,
+            "recommended_measures": [
+                {
+                    "measure": "Double Glazing",
+                    "description": "Installation of double glazing",
+                    "sap_points": 2,
+                    "ending_sap": 48,
+                },
+                {
+                    "measure": "Draught Proofing",
+                    "description": "Draught proofing improvements",
+                    "sap_points": 1,
+                    "ending_sap": 49,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 3.2kW capacity, east-facing",
+                    "config": [
+                        {
+                            "size": "3.2W",
+                            "orientation": "East",
+                            "elavation": 30,
+                            "overshading": "Little or none",
+                        }
+                    ],
+                    "sap_points": 9,
+                    "ending_sap": 58
+                },
+                {
+                    "measure": "Air Source Heat Pump",
+                    "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump",
+                    "sap_points": 15,
+                    "ending_sap": 73
+                },
+                {
+                    "measure": "Tariff Review",
+                    "description": "Switch to 24-hour tariff",
+                    "sap_points": 15,
+                    "ending_sap": 88
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-13",
+            "starting_sap": 53,
+            "recommended_measures": [
+                {
+                    "measure": "Roof Insulation",
+                    "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+                    "sap_points": 6,
+                    "ending_sap": 59,
+                },
+                {
+                    "measure": "Flat Roof Insulation",
+                    "description": "Flat roof insulation",
+                    "sap_points": 2,
+                    "ending_sap": 61,
+                },
+                {
+                    "measure": "Cavity Wall Insulation",
+                    "description": "Cavity wall insulation",
+                    "sap_points": 6,
+                    "ending_sap": 67,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "Ventilation improvement",
+                    "sap_points": 0,
+                    "ending_sap": 67,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Thermostatic Time Zone Control",
+                    "sap_points": 2,
+                    "ending_sap": 69,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 4kW capacity, flat roof installation",
+                    "config": [
+                        {
+                            "size": "4kW",
+                            "orientation": "Horizontal",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 9,
+                    "ending_sap": 78
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-14",
+            "starting_sap": 63,
+            "recommended_measures": [
+                {
+                    "measure": "Cavity Wall Insulation",
+                    "description": "Insulation for cavity walls",
+                    "sap_points": 5,
+                    "ending_sap": 68,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "Ventilation improvement",
+                    "sap_points": 0,
+                    "ending_sap": 68,
+                },
+                {
+                    "measure": "Loft Insulation",
+                    "description": "Installation of loft insulation",
+                    "sap_points": 1,
+                    "ending_sap": 69,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "Solar PV system with 10kW capacity",
+                    "sap_points": 10,
+                    "ending_sap": 79,
+                }
+            ]
+        },
+    ]
+
+    # Step 1: Normalize the recommended_measures data into a DataFrame.
+    normalized_measures = []
+
+    for survey in recommended_measures:
+        survey_key = survey["survey_key"]
+        starting_sap = survey["starting_sap"]
+        for measure in survey.get("recommended_measures", []):
+            normalized_measures.append({
+                "survey_key": survey_key,
+                "starting_sap": starting_sap,
+                "measure": measure["measure"],
+                "description": measure.get("description", "")
+            })
+
+    # Convert the normalized list into a DataFrame.
+    measures_df = pd.DataFrame(normalized_measures)
+
+    # Step 2: Pivot the measures_df to have a column for each measure type, using the description as values.
+    pivoted_measures = measures_df.pivot_table(
+        index="survey_key",
+        columns="measure",
+        values="description",
+        aggfunc=lambda x: ' '.join(x),  # Concatenate descriptions if there are multiple entries.
+        fill_value=None
+    ).reset_index()
+
+    # Step 3: Extract starting SAP for each survey key.
+    starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]]
+
+    # Merge starting SAP back onto pivoted measures.
+    result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left")
+
+    # Step 4: Calculate the ending SAP using the total sap points.
+    # Note: If you want to use total sap points, you'll need to update the total calculation accordingly.
+
+    # Step 5: Merge the result with the measures_data to get the final DataFrame.
+    final_measures = measures_data.merge(
+        result_df, how="left", on="survey_key"
+    )
 
 
 if __name__ == "__main__":

From 56fb33a64a16261f35f286adffc8268503fac24c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Oct 2024 18:39:55 +0100
Subject: [PATCH 08/59] added placeholder pricing sheet

---
 etl/customers/aiha/xml_extraction.py | 101 ++++++++++++++++++---------
 1 file changed, 68 insertions(+), 33 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 416065e7..563ed7ca 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -90,7 +90,7 @@ def main():
     #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
     #   - AIH001-11 - The layout of this unit is confusing, is there roof access?
     #   - AIH001-12 - Why was there not access to the cylinder?
-    #
+    #   - AIH001-12 - Is the need to draught proofing due to the windows?
 
     recommended_measures = [
         {
@@ -111,7 +111,7 @@ def main():
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with various configurations",
+                    "description": "4kWp Solar PV system",
                     "config": [
                         {
                             "size": "4kWp",
@@ -131,13 +131,13 @@ def main():
                 },
                 {
                     "measure": "Loft Insulation",
-                    "description": "300mm of loft insulation",
+                    "description": "300mm loft insulation",
                     "sap_points": 8,
                     "ending_sap": 61
                 },
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 3,
                     "ending_sap": 64
                 }
@@ -157,16 +157,16 @@ def main():
                 },
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 3,
                     "ending_sap": 55
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 4kW capacity, south-facing",
+                    "description": "4kWp Solar PV system",
                     "config": [
                         {
-                            "size": "4kW",
+                            "size": "4kWp",
                             "orientation": "South",
                             "elavation": 30,
                             "overshading": "Modest",
@@ -196,7 +196,7 @@ def main():
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 4kW capacity, horizontal orientation",
+                    "description": "4kWp Solar PV system",
                     "config": [
                         {
                             "size": "4kW",
@@ -210,7 +210,7 @@ def main():
                 },
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 3,
                     "ending_sap": 73
                 }
@@ -229,7 +229,7 @@ def main():
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 2kW capacity, south-facing",
+                    "description": "2kWp Solar PV system",
                     "config": [
                         {
                             "size": "2kW",
@@ -255,7 +255,7 @@ def main():
             "recommended_measures": [
                 {
                     "measure": "Loft Insulation",
-                    "description": "300mm of loft insulation",
+                    "description": "300mm loft insulation",
                     "sap_points": 2,
                     "ending_sap": 58,
                 },
@@ -273,7 +273,7 @@ def main():
                 },
                 {
                     "measure": "Ventilation",
-                    "description": "Ventilation improvement",
+                    "description": "2x DMEV fans",
                     "sap_points": 0,
                     "ending_sap": 69,
                 }
@@ -291,19 +291,19 @@ def main():
                 },
                 {
                     "measure": "Cavity Wall Insulation",
-                    "description": "Cavity wall insulation for extensions",
+                    "description": "CWI to rdSAP default standard",
                     "sap_points": 1,
                     "ending_sap": 53,
                 },
                 {
                     "measure": "Ventilation",
-                    "description": "Ventilation improvement",
+                    "description": "2x DMEV fans",
                     "sap_points": 0,
                     "ending_sap": 53,
                 },
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 3,
                     "ending_sap": 56,
                 }
@@ -315,7 +315,7 @@ def main():
             "recommended_measures": [
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 4,
                     "ending_sap": 63,
                 },
@@ -345,13 +345,13 @@ def main():
                 },
                 {
                     "measure": "Draught Proofing",
-                    "description": "Draught proofing improvements",
+                    "description": "Window draught proofing improvements",
                     "sap_points": 1,
                     "ending_sap": 49,
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 3.2kW capacity, east-facing",
+                    "description": "3.2kWp Solar PV system",
                     "config": [
                         {
                             "size": "3.2W",
@@ -383,37 +383,37 @@ def main():
             "recommended_measures": [
                 {
                     "measure": "Roof Insulation",
-                    "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+                    "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
                     "sap_points": 6,
                     "ending_sap": 59,
                 },
                 {
                     "measure": "Flat Roof Insulation",
-                    "description": "Flat roof insulation",
+                    "description": "100mm flat roof insulation",
                     "sap_points": 2,
                     "ending_sap": 61,
                 },
                 {
                     "measure": "Cavity Wall Insulation",
-                    "description": "Cavity wall insulation",
+                    "description": "CWI to rdSAP default standard",
                     "sap_points": 6,
                     "ending_sap": 67,
                 },
                 {
                     "measure": "Ventilation",
-                    "description": "Ventilation improvement",
+                    "description": "2x DMEV fans",
                     "sap_points": 0,
                     "ending_sap": 67,
                 },
                 {
                     "measure": "TTZC",
-                    "description": "Thermostatic Time Zone Control",
+                    "description": "Smart Thermostat",
                     "sap_points": 2,
                     "ending_sap": 69,
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 4kW capacity, flat roof installation",
+                    "description": "4kWp Solar PV system",
                     "config": [
                         {
                             "size": "4kW",
@@ -433,25 +433,25 @@ def main():
             "recommended_measures": [
                 {
                     "measure": "Cavity Wall Insulation",
-                    "description": "Insulation for cavity walls",
+                    "description": "CWI to rdSAP default standard",
                     "sap_points": 5,
                     "ending_sap": 68,
                 },
                 {
                     "measure": "Ventilation",
-                    "description": "Ventilation improvement",
+                    "description": "2x DMEV fans",
                     "sap_points": 0,
                     "ending_sap": 68,
                 },
                 {
                     "measure": "Loft Insulation",
-                    "description": "Installation of loft insulation",
+                    "description": "300mm loft insulation",
                     "sap_points": 1,
                     "ending_sap": 69,
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "Solar PV system with 10kW capacity",
+                    "description": "3.2kWp Solar PV system",
                     "sap_points": 10,
                     "ending_sap": 79,
                 }
@@ -459,6 +459,33 @@ def main():
         },
     ]
 
+    descs = []
+    for r in recommended_measures:
+        for m in r["recommended_measures"]:
+            descs.append(m["description"])
+    descs = list(set(descs))
+
+    # TODO - need to add scaffolding
+    pricing_data = [
+        {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'},
+        {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'},
+        {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'},
+        {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'},
+        {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'},
+        {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None},
+        {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
+        {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'},
+        {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'},
+        {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
+        {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80,
+         'unit': 'floor_m2'},
+        {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
+        {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'},
+        {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'},
+        {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}
+    ]
+    pricing_data = pd.DataFrame(pricing_data)
+
     # Step 1: Normalize the recommended_measures data into a DataFrame.
     normalized_measures = []
 
@@ -470,7 +497,8 @@ def main():
                 "survey_key": survey_key,
                 "starting_sap": starting_sap,
                 "measure": measure["measure"],
-                "description": measure.get("description", "")
+                "description": measure.get("description", ""),
+                "sap_points": measure.get("sap_points", 0)
             })
 
     # Convert the normalized list into a DataFrame.
@@ -485,16 +513,23 @@ def main():
         fill_value=None
     ).reset_index()
 
-    # Step 3: Extract starting SAP for each survey key.
+    # Step 3: Calculate the total sap points for each survey.
+    total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index()
+    total_sap_points.columns = ["survey_key", "total_sap_points"]
+
+    # Merge total sap points into the pivoted measures.
+    pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left")
+
+    # Step 4: Extract starting SAP for each survey key.
     starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]]
 
     # Merge starting SAP back onto pivoted measures.
     result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left")
 
-    # Step 4: Calculate the ending SAP using the total sap points.
-    # Note: If you want to use total sap points, you'll need to update the total calculation accordingly.
+    # Step 5: Calculate the ending SAP.
+    result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"]
 
-    # Step 5: Merge the result with the measures_data to get the final DataFrame.
+    # Step 6: Merge the result with the measures_data to get the final DataFrame.
     final_measures = measures_data.merge(
         result_df, how="left", on="survey_key"
     )

From 93d375bc7a4f0e845c3bb13c9ff00b4b33fd7ff1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Oct 2024 19:11:40 +0100
Subject: [PATCH 09/59] adding aiha costing

---
 etl/customers/aiha/xml_extraction.py | 46 +++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 563ed7ca..29ac44c6 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -65,7 +65,7 @@ def main():
 
     # THis is the data we need for the AIHA project
     measures_data = extracted_surveys[
-        ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"]
+        ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", "number_of_floors"]
     ]
     measures_data = measures_data.sort_values("survey_key", ascending=True)
 
@@ -459,15 +459,20 @@ def main():
         },
     ]
 
-    descs = []
-    for r in recommended_measures:
-        for m in r["recommended_measures"]:
-            descs.append(m["description"])
-    descs = list(set(descs))
+    scaffolding_data = [
+        {
+            "number_of_floors": 2,
+            "price": 841,
+        },
+        {
+            "number_of_floors": 3,
+            "price": 1077,
+        }
+    ]
 
-    # TODO - need to add scaffolding
+    # TODO - Need an update cost for cylinder insulation
     pricing_data = [
-        {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'},
+        {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'},
         {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'},
         {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'},
         {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'},
@@ -486,6 +491,31 @@ def main():
     ]
     pricing_data = pd.DataFrame(pricing_data)
 
+    for recommendation in recommended_measures:
+
+        property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze()
+
+        for measure in recommendation["recommended_measures"]:
+            measure_pricing = pricing_data[pricing_data["item"] == measure["description"]]
+            measure_unit = measure_pricing["unit"].values[0]
+            if measure_unit is None:
+                blah
+                continue
+
+            if measure_unit == "unit":
+                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0])
+                continue
+
+            if measure_unit == "unit_needs_scaffolding":
+                # We need the number of floors
+                n_floors = property_data["number_of_floors"]
+                cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"]
+                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding
+
+            blah
+
+            measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0]
+
     # Step 1: Normalize the recommended_measures data into a DataFrame.
     normalized_measures = []
 

From 854c784bd9c4341546ea57d2a0549b40552fbd92 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Oct 2024 19:32:15 +0100
Subject: [PATCH 10/59] working on the costing methodology

---
 etl/customers/aiha/xml_extraction.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 29ac44c6..4d4705c9 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -3,6 +3,7 @@ from io import BytesIO
 
 import pandas as pd
 
+from etl.ownership.config import EXCLUDED_UPRNS
 from etl.xml_survey_extraction.XmlParser import XmlParser
 
 SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
@@ -91,6 +92,7 @@ def main():
     #   - AIH001-11 - The layout of this unit is confusing, is there roof access?
     #   - AIH001-12 - Why was there not access to the cylinder?
     #   - AIH001-12 - Is the need to draught proofing due to the windows?
+    #   - AIH001-04 - is the flat roof area correct?
 
     recommended_measures = [
         {
@@ -132,6 +134,7 @@ def main():
                 {
                     "measure": "Loft Insulation",
                     "description": "300mm loft insulation",
+                    "floor_area": 80,  # Based on area of 1st floor
                     "sap_points": 8,
                     "ending_sap": 61
                 },
@@ -152,6 +155,7 @@ def main():
                 {
                     "measure": "Flat Roof Insulation",
                     "description": "100mm flat roof insulation",
+                    "floor_area": 39.1482,  # based on area of top floor
                     "sap_points": 4,
                     "ending_sap": 52
                 },
@@ -185,6 +189,7 @@ def main():
                 {
                     "measure": "Flat Roof Insulation",
                     "description": "100mm flat roof insulation",
+                    "floor_area": 49.48,  # based on area of top floor
                     "sap_points": 5,
                     "ending_sap": 59,
                 },
@@ -256,6 +261,7 @@ def main():
                 {
                     "measure": "Loft Insulation",
                     "description": "300mm loft insulation",
+                    "floor_area": 54.2864,  # Based on area of top floor
                     "sap_points": 2,
                     "ending_sap": 58,
                 },
@@ -390,6 +396,7 @@ def main():
                 {
                     "measure": "Flat Roof Insulation",
                     "description": "100mm flat roof insulation",
+                    "floor_area": 33.06,  # Based on area of the extension
                     "sap_points": 2,
                     "ending_sap": 61,
                 },
@@ -445,7 +452,8 @@ def main():
                 },
                 {
                     "measure": "Loft Insulation",
-                    "description": "300mm loft insulation",
+                    "description": "300mm loft insulation",  # Based on area of main building
+                    "floor_area": 59.20,
                     "sap_points": 1,
                     "ending_sap": 69,
                 },
@@ -511,10 +519,18 @@ def main():
                 n_floors = property_data["number_of_floors"]
                 cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"]
                 measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding
+                continue
 
-            blah
+            if measure_unit == "floor_m2":
+                floor_area = measure["floor_area"]
+                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area
+                continue
 
-            measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0]
+            if measure_unit == "hlp_m2":
+                hlp = measure["hlp"]
+                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp
+
+            raise Exception("Unknown unit type")
 
     # Step 1: Normalize the recommended_measures data into a DataFrame.
     normalized_measures = []

From 8325f1bf7a7bcf0cb7ebd94f6a83c49684163e17 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 10:18:53 +0000
Subject: [PATCH 11/59] Finished costings WIP

---
 etl/customers/aiha/xml_extraction.py | 76 ++++++++++++++++------------
 1 file changed, 44 insertions(+), 32 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 4d4705c9..c246105a 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -3,10 +3,10 @@ from io import BytesIO
 
 import pandas as pd
 
-from etl.ownership.config import EXCLUDED_UPRNS
 from etl.xml_survey_extraction.XmlParser import XmlParser
 
 SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
+CONTINGENCY_RATE = 0.26
 
 
 def main():
@@ -274,6 +274,7 @@ def main():
                 {
                     "measure": "Internal Wall Insulation",
                     "description": "100mm internal wall insulation",
+                    "hlp": 24.13 * 2.63,
                     "sap_points": 5,
                     "ending_sap": 69,
                 },
@@ -292,12 +293,14 @@ def main():
                 {
                     "measure": "Internal Wall Insulation",
                     "description": "100mm internal wall insulation",
+                    "hlp": (22.35 * 3.24) + (22.13 * 2.53),
                     "sap_points": 8,
                     "ending_sap": 52,
                 },
                 {
                     "measure": "Cavity Wall Insulation",
                     "description": "CWI to rdSAP default standard",
+                    "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39),  # 1st & 2nd extension
                     "sap_points": 1,
                     "ending_sap": 53,
                 },
@@ -328,6 +331,7 @@ def main():
                 {
                     "measure": "Internal Wall Insulation",
                     "description": "100mm internal wall insulation",
+                    "hlp": (18.50 * 3.12) + (19.00 * 2.75),
                     "sap_points": 5,
                     "ending_sap": 68,
                 },
@@ -346,12 +350,15 @@ def main():
                 {
                     "measure": "Double Glazing",
                     "description": "Installation of double glazing",
+                    "n_windows": 20,  # Counted the bay windows each as 3
+                    "windows_area": 10.66,
                     "sap_points": 2,
                     "ending_sap": 48,
                 },
                 {
                     "measure": "Draught Proofing",
                     "description": "Window draught proofing improvements",
+                    "n_windows": 20,  # Counted the bay windows each as 3
                     "sap_points": 1,
                     "ending_sap": 49,
                 },
@@ -390,6 +397,7 @@ def main():
                 {
                     "measure": "Roof Insulation",
                     "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+                    "floor_area": 39.75,  # based on the floor area of the RIR
                     "sap_points": 6,
                     "ending_sap": 59,
                 },
@@ -403,6 +411,7 @@ def main():
                 {
                     "measure": "Cavity Wall Insulation",
                     "description": "CWI to rdSAP default standard",
+                    "hlp": (35.40 * 2.65) + (26.70 * 2.73) + (16.30 * 2.71),  # 1st & 2nd extension
                     "sap_points": 6,
                     "ending_sap": 67,
                 },
@@ -441,6 +450,7 @@ def main():
                 {
                     "measure": "Cavity Wall Insulation",
                     "description": "CWI to rdSAP default standard",
+                    "hlp": (11.00 * 2.6) + (11.00 * 2.65) + (4.60 * 2.7),
                     "sap_points": 5,
                     "ending_sap": 68,
                 },
@@ -483,11 +493,11 @@ def main():
         {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'},
         {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'},
         {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'},
-        {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'},
-        {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'},
+        {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'},
+        {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'},
         {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None},
         {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
-        {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'},
+        {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'},
         {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'},
         {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
         {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80,
@@ -500,51 +510,49 @@ def main():
     pricing_data = pd.DataFrame(pricing_data)
 
     for recommendation in recommended_measures:
-
         property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze()
+        total_cost = 0
 
         for measure in recommendation["recommended_measures"]:
             measure_pricing = pricing_data[pricing_data["item"] == measure["description"]]
             measure_unit = measure_pricing["unit"].values[0]
-            if measure_unit is None:
-                blah
-                continue
 
-            if measure_unit == "unit":
-                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0])
-                continue
-
-            if measure_unit == "unit_needs_scaffolding":
-                # We need the number of floors
+            if measure_unit in ["unit", None]:
+                measure_cost = float(measure_pricing["unit_price"].values[0])
+            elif measure_unit == "unit_needs_scaffolding":
                 n_floors = property_data["number_of_floors"]
-                cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"]
-                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding
-                continue
+                scaffolding_cost = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"]
+                measure_cost = float(measure_pricing["unit_price"].values[0]) + scaffolding_cost
+            elif measure_unit == "floor_m2":
+                measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["floor_area"]
+            elif measure_unit == "hlp_m2":
+                measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["hlp"]
+            elif measure_unit == "window":
+                measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["n_windows"]
+            else:
+                raise Exception("Unknown unit type")
 
-            if measure_unit == "floor_m2":
-                floor_area = measure["floor_area"]
-                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area
-                continue
+            measure["Total Cost"] = measure_cost
+            total_cost += measure_cost
 
-            if measure_unit == "hlp_m2":
-                hlp = measure["hlp"]
-                measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp
-
-            raise Exception("Unknown unit type")
+        recommendation["total_cost"] = total_cost
 
     # Step 1: Normalize the recommended_measures data into a DataFrame.
     normalized_measures = []
-
     for survey in recommended_measures:
         survey_key = survey["survey_key"]
         starting_sap = survey["starting_sap"]
+        total_cost = survey.get("total_cost", 0)
+
         for measure in survey.get("recommended_measures", []):
             normalized_measures.append({
                 "survey_key": survey_key,
                 "starting_sap": starting_sap,
                 "measure": measure["measure"],
                 "description": measure.get("description", ""),
-                "sap_points": measure.get("sap_points", 0)
+                "sap_points": measure.get("sap_points", 0),
+                "measure_cost": measure.get("Total Cost", 0),
+                "total_cost": total_cost
             })
 
     # Convert the normalized list into a DataFrame.
@@ -559,12 +567,16 @@ def main():
         fill_value=None
     ).reset_index()
 
-    # Step 3: Calculate the total sap points for each survey.
-    total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index()
-    total_sap_points.columns = ["survey_key", "total_sap_points"]
+    # Step 3: Calculate the total sap points and total cost for each survey.
+    sap_cost_totals = measures_df.groupby("survey_key").agg(
+        total_sap_points=("sap_points", "sum"),
+        total_cost_of_measures=("measure_cost", "sum")
+    ).reset_index()
 
     # Merge total sap points into the pivoted measures.
-    pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left")
+    pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left")
+    pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE
+    pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"]
 
     # Step 4: Extract starting SAP for each survey key.
     starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]]

From 7513e475d3cac3a21a95b0096833a43914ee7974 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 10:57:26 +0000
Subject: [PATCH 12/59] adding in the basic structure of the extraction code

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 .../stonewater/Wave 3 Preparation.py          | 92 +++++++++++++++++++
 .../requirements/requirements-wave-3-prep.txt |  1 +
 4 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 etl/customers/stonewater/Wave 3 Preparation.py
 create mode 100644 etl/customers/stonewater/requirements/requirements-wave-3-prep.txt

diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
new file mode 100644
index 00000000..bd916494
--- /dev/null
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -0,0 +1,92 @@
+import os
+import PyPDF2
+import re
+
+FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
+
+
+def extract_summary_report(pdf_path):
+    """
+    Extracts specific data from the provided PDF file.
+    Data includes:
+    - Current SAP rating
+    - Fuel Bill
+    - Emissions (t/year)
+    """
+    data = {
+        "Current SAP rating": None,
+        "Fuel Bill": None,
+        "Emissions (t/year)": None,
+    }
+
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+
+        # Extract Current SAP rating
+        sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
+        if sap_match:
+            data["Current SAP rating"] = sap_match.group(1)
+
+        # Extract Fuel Bill
+        fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
+        if fuel_bill_match:
+            data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+        # Extract Emissions
+        emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
+        if emissions_match:
+            data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
+
+    return data
+
+
+def main():
+    """
+    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+    """
+    # List only directories in the specified FILE_PATH
+    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+
+    extracted_data = []
+    for survey_folder in survey_folders:
+        # List the folders inside of the survey folder
+        survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder))
+                             if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))]
+
+        if not survey_subfolders:
+            continue
+
+        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment:
+        # If it exists, we will use the data from that folder
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+
+        # List contents of the retrofit folder
+        retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder))
+
+        if not retrofit_files:
+            continue
+
+        # We now look for specific files:
+        # 1) Check the summary report.- the title will contain the word "summary" (lowercase) and the file extension is
+        # .pdf
+        summary_report = next(
+            (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None
+        )
+        if summary_report is not None:
+            pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report)
+            summary_data = extract_summary_report(pdf_path)
+            summary_data = {
+                "survey_folder": survey_folder,
+                **summary_data
+            }
+            extracted_data.append(summary_data)
+            continue
+
+        raise NotImplementedError("IMPLEMENT ME!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
new file mode 100644
index 00000000..e9a5c8ea
--- /dev/null
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -0,0 +1 @@
+PyPDF2

From 0332c77098b4b77576422eb6b1cf1898f0ed79c3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 11:21:54 +0000
Subject: [PATCH 13/59] refactoring structure of extraction code

---
 .../stonewater/Wave 3 Preparation.py          | 80 +++++++++++++------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index bd916494..976a953f 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -43,6 +43,42 @@ def extract_summary_report(pdf_path):
     return data
 
 
+def extract_retrofit_assessment_folder(retrofit_folder_path):
+    """
+    Handles extraction from a retrofit assessment folder if it exists and has content.
+    """
+    retrofit_files = os.listdir(retrofit_folder_path)
+
+    # Find the summary report in the retrofit folder
+    summary_report = next(
+        (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None
+    )
+
+    if summary_report:
+        pdf_path = os.path.join(retrofit_folder_path, summary_report)
+        return extract_summary_report(pdf_path)
+
+    return None  # If no relevant PDF is found
+
+
+def extract_from_survey_folder_files(survey_folder_path):
+    """
+    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
+    """
+    survey_files = os.listdir(survey_folder_path)
+
+    # Look for a summary report directly in the survey folder
+    summary_report = next(
+        (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None
+    )
+
+    if summary_report:
+        pdf_path = os.path.join(survey_folder_path, summary_report)
+        return extract_summary_report(pdf_path)
+
+    return None  # If no relevant PDF is found
+
+
 def main():
     """
     This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
@@ -52,40 +88,38 @@ def main():
 
     extracted_data = []
     for survey_folder in survey_folders:
+        survey_folder_path = os.path.join(FILE_PATH, survey_folder)
+
         # List the folders inside of the survey folder
-        survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder))
-                             if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))]
+        survey_subfolders = [name for name in os.listdir(survey_folder_path)
+                             if os.path.isdir(os.path.join(survey_folder_path, name))]
 
-        if not survey_subfolders:
-            continue
-
-        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment:
-        # If it exists, we will use the data from that folder
+        # Check if there's a "retrofit assessment" folder
         retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
 
-        # List contents of the retrofit folder
-        retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder))
+        # If retrofit assessment folder exists, check if it has content
+        if retrofit_folder:
+            retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+            if os.listdir(retrofit_folder_path):  # If not empty
+                summary_data = extract_retrofit_assessment_folder(retrofit_folder_path)
+                if summary_data:
+                    summary_data = {
+                        "survey_folder": survey_folder,
+                        **summary_data
+                    }
+                    extracted_data.append(summary_data)
+                    continue
 
-        if not retrofit_files:
-            continue
-
-        # We now look for specific files:
-        # 1) Check the summary report.- the title will contain the word "summary" (lowercase) and the file extension is
-        # .pdf
-        summary_report = next(
-            (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None
-        )
-        if summary_report is not None:
-            pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report)
-            summary_data = extract_summary_report(pdf_path)
+        # If no retrofit folder or it was empty, check files in survey_folder
+        summary_data = extract_from_survey_folder_files(survey_folder_path)
+        if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,
                 **summary_data
             }
             extracted_data.append(summary_data)
-            continue
 
-        raise NotImplementedError("IMPLEMENT ME!")
+    print("Extracted Data:", extracted_data)
 
 
 if __name__ == "__main__":

From cf2a94cb365b3903a733653136ae793b6a8299a4 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 12:04:57 +0000
Subject: [PATCH 14/59] extracting epr

---
 .../stonewater/Wave 3 Preparation.py          | 94 +++++++++++++++++--
 1 file changed, 84 insertions(+), 10 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 976a953f..53d5bb34 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -43,6 +43,65 @@ def extract_summary_report(pdf_path):
     return data
 
 
+def extract_epr(pdf_path):
+    """
+    Extracts specific data from an Energy Report (EPR) PDF file.
+    """
+    data = {
+        "Address": None,
+        "Estimated Annual Costs": None,
+        "Current SAP": None,
+        "Space Heating": None,
+        "Water Heating": None,
+        "Fuel Bill": None,
+    }
+
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+
+        # Extract Address
+        address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
+        data["Address"] = address_match.group(1).strip()
+
+        # Extract Total Floor Area
+        area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
+        data["Total Floor Area"] = area_match.group(1)
+
+        # Extract Estimated Annual Costs
+        cost_match = re.search(r"TOTAL\s*£(\d+)", text)
+        data["Estimated Annual Costs"] = f"£{cost_match.group(1)}"
+
+        # Extract Current SAP rating
+        # Updated Regular Expression to find "GG (1-20)" followed by two numbers
+        sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
+
+        # Extract and validate the Current and Potential SAP ratings
+        current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
+        # Ensure potential is greater than or equal to current
+        if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
+            data["Current SAP"] = current_sap
+            data["Potential SAP"] = potential_sap
+        else:
+            raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
+
+        # Extract Space Heating (kWh)
+        space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text)
+        data["Space Heating"] = int(space_heating_match.group(1))
+
+        # Extract Water Heating (kWh)
+        water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text)
+        data["Water Heating"] = int(water_heating_match.group(1))
+
+        # Extract Fuel Bill (total estimated costs)
+        fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+    return data
+
+
 def extract_retrofit_assessment_folder(retrofit_folder_path):
     """
     Handles extraction from a retrofit assessment folder if it exists and has content.
@@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
     return None  # If no relevant PDF is found
 
 
+def is_energy_report(text):
+    """
+    Determines if the provided text indicates that the PDF is an Energy Report.
+    Returns True if the text contains 'Energy Report'.
+    """
+    return text.startswith("ENERGY REPORT")
+
+
 def extract_from_survey_folder_files(survey_folder_path):
     """
     Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
     """
-    survey_files = os.listdir(survey_folder_path)
+    survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")]
 
-    # Look for a summary report directly in the survey folder
-    summary_report = next(
-        (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None
-    )
+    for pdf_file in survey_files:
+        pdf_path = os.path.join(survey_folder_path, pdf_file)
 
-    if summary_report:
-        pdf_path = os.path.join(survey_folder_path, summary_report)
-        return extract_summary_report(pdf_path)
+        # Attempt to read the first page of the PDF to determine type
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
 
-    return None  # If no relevant PDF is found
+            if is_energy_report(first_page_text):
+                # Treat this as an Energy Report
+                return extract_epr(pdf_path)
+            elif "summary" in pdf_file.lower():
+                # Treat this as a Summary Report
+                return extract_summary_report(pdf_path)
+            else:
+                raise NotImplementedError("Implement me")
+
+    return None
 
 
 def main():
@@ -109,7 +184,6 @@ def main():
                     }
                     extracted_data.append(summary_data)
                     continue
-
         # If no retrofit folder or it was empty, check files in survey_folder
         summary_data = extract_from_survey_folder_files(survey_folder_path)
         if summary_data:

From 33ea47e71d8b0a226629400dca5b6400b46daf96 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 12:42:28 +0000
Subject: [PATCH 15/59] fixed address extraction

---
 .../stonewater/Wave 3 Preparation.py          | 47 ++++++++++++++-----
 .../requirements/requirements-wave-3-prep.txt |  1 +
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 53d5bb34..bc567bd2 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1,6 +1,7 @@
 import os
 import PyPDF2
 import re
+import pandas as pd
 
 FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
 
@@ -11,12 +12,12 @@ def extract_summary_report(pdf_path):
     Data includes:
     - Current SAP rating
     - Fuel Bill
-    - Emissions (t/year)
+    - Address
     """
     data = {
-        "Current SAP rating": None,
+        "Address": None,
+        "Current SAP Rating": None,
         "Fuel Bill": None,
-        "Emissions (t/year)": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -28,17 +29,36 @@ def extract_summary_report(pdf_path):
         # Extract Current SAP rating
         sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
         if sap_match:
-            data["Current SAP rating"] = sap_match.group(1)
+            data["Current SAP Rating"] = sap_match.group(1)
 
         # Extract Fuel Bill
         fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
         if fuel_bill_match:
             data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
 
-        # Extract Emissions
-        emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
-        if emissions_match:
-            data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
+        # Extract individual address components
+        postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
+        # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
+        house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
+        house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
+        street = re.search(r"Street:\s*(.*?)\nLocality:", text)
+        locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
+        town = re.search(r"Town:\s*(.*?)\nCounty:", text)
+        county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
+
+        # Clean extracted values and remove any prefixes
+        address_parts = [
+            house_no.group(1).strip() if house_no else "",
+            house_name.group(1).strip() if house_name else "",
+            street.group(1).strip() if street else "",
+            locality.group(1).strip() if locality else "",
+            town.group(1).strip() if town else "",
+            county.group(1).strip() if county else "",
+            postcode.group(1).strip() if postcode else ""
+        ]
+
+        # Join non-empty parts with a comma
+        data["Address"] = ", ".join([part for part in address_parts if part])
 
     return data
 
@@ -49,8 +69,7 @@ def extract_epr(pdf_path):
     """
     data = {
         "Address": None,
-        "Estimated Annual Costs": None,
-        "Current SAP": None,
+        "Current SAP Rating": None,
         "Space Heating": None,
         "Water Heating": None,
         "Fuel Bill": None,
@@ -82,8 +101,8 @@ def extract_epr(pdf_path):
         current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
         # Ensure potential is greater than or equal to current
         if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
-            data["Current SAP"] = current_sap
-            data["Potential SAP"] = potential_sap
+            data["Current SAP Rating"] = current_sap
+            data["Potential SAP Rating"] = potential_sap
         else:
             raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
 
@@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
         pdf_path = os.path.join(retrofit_folder_path, summary_report)
         return extract_summary_report(pdf_path)
 
+    raise Exception("Not Implemented")
+
     return None  # If no relevant PDF is found
 
 
@@ -193,7 +214,7 @@ def main():
             }
             extracted_data.append(summary_data)
 
-    print("Extracted Data:", extracted_data)
+    extracted_data = pd.DataFrame(extracted_data)
 
 
 if __name__ == "__main__":
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index e9a5c8ea..2cabb047 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -1 +1,2 @@
 PyPDF2
+pandas

From c68e4f017e48f4cb12639cbd9f69ce40849e68fd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 12:43:59 +0000
Subject: [PATCH 16/59] additional data cleaning

---
 etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index bc567bd2..c6736ba8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -86,12 +86,8 @@ def extract_epr(pdf_path):
         data["Address"] = address_match.group(1).strip()
 
         # Extract Total Floor Area
-        area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
-        data["Total Floor Area"] = area_match.group(1)
-
-        # Extract Estimated Annual Costs
-        cost_match = re.search(r"TOTAL\s*£(\d+)", text)
-        data["Estimated Annual Costs"] = f"£{cost_match.group(1)}"
+        # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
+        # data["Total Floor Area"] = area_match.group(1)
 
         # Extract Current SAP rating
         # Updated Regular Expression to find "GG (1-20)" followed by two numbers
@@ -216,6 +212,5 @@ def main():
 
     extracted_data = pd.DataFrame(extracted_data)
 
-
-if __name__ == "__main__":
-    main()
+# if __name__ == "__main__":
+#     main()

From 70d02075cf1da79ccce4950cb8080a9b05745a6d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 14:16:33 +0000
Subject: [PATCH 17/59] allowing extract_retrofit_assessment_folder to handle
 eprs

---
 .../stonewater/Wave 3 Preparation.py          | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index c6736ba8..14e50460 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -121,20 +121,25 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
     """
     Handles extraction from a retrofit assessment folder if it exists and has content.
     """
-    retrofit_files = os.listdir(retrofit_folder_path)
+    retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")]
 
-    # Find the summary report in the retrofit folder
-    summary_report = next(
-        (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None
-    )
+    for pdf_file in retrofit_files:
+        pdf_path = os.path.join(retrofit_folder_path, pdf_file)
 
-    if summary_report:
-        pdf_path = os.path.join(retrofit_folder_path, summary_report)
-        return extract_summary_report(pdf_path)
+        # Attempt to read the first page of the PDF to determine the report type
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
 
-    raise Exception("Not Implemented")
+            if is_energy_report(first_page_text):
+                # Treat this as an Energy Report
+                return extract_epr(pdf_path)
+            elif "summary" in pdf_file.lower():
+                # Treat this as a Summary Report
+                return extract_summary_report(pdf_path)
 
-    return None  # If no relevant PDF is found
+    # If no relevant PDF is found, raise an exception
+    raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
 
 
 def is_energy_report(text):

From 371f17f87e986a5d70ae7b0e66f9748f82adac6e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 14:20:33 +0000
Subject: [PATCH 18/59] adding additional catch for summary report

---
 etl/customers/stonewater/Wave 3 Preparation.py     | 14 +++++++++++++-
 .../requirements/requirements-wave-3-prep.txt      |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 14e50460..dc71d449 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2,6 +2,7 @@ import os
 import PyPDF2
 import re
 import pandas as pd
+from tqdm import tqdm
 
 FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
 
@@ -137,6 +138,10 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
             elif "summary" in pdf_file.lower():
                 # Treat this as a Summary Report
                 return extract_summary_report(pdf_path)
+            elif is_summary_report(first_page_text):
+                # other ways to detect a summary report
+                # Treat this as a Summary Report
+                return extract_summary_report(pdf_path)
 
     # If no relevant PDF is found, raise an exception
     raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
@@ -150,6 +155,13 @@ def is_energy_report(text):
     return text.startswith("ENERGY REPORT")
 
 
+def is_summary_report(text):
+    """
+    Determines if the provided text indicates that the PDF is a Summary Report.
+    """
+    return text.startswith("Summary Information")
+
+
 def extract_from_survey_folder_files(survey_folder_path):
     """
     Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
@@ -184,7 +196,7 @@ def main():
     survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
 
     extracted_data = []
-    for survey_folder in survey_folders:
+    for survey_folder in tqdm(survey_folders):
         survey_folder_path = os.path.join(FILE_PATH, survey_folder)
 
         # List the folders inside of the survey folder
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index 2cabb047..70bec3cc 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -1,2 +1,3 @@
 PyPDF2
 pandas
+tqdm

From 4e9acdeb8e2222b7c44c05749667fe258fa87982 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 14:23:34 +0000
Subject: [PATCH 19/59] refactored report detection into detect_and_parse_report

---
 .../stonewater/Wave 3 Preparation.py          | 67 +++++++------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index dc71d449..30a23e86 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -118,30 +118,15 @@ def extract_epr(pdf_path):
     return data
 
 
-def extract_retrofit_assessment_folder(retrofit_folder_path):
+def extract_retrofit_pdfs(data_folder_path):
     """
-    Handles extraction from a retrofit assessment folder if it exists and has content.
+    Handles extraction from a retrofit data folder if it exists and has content.
     """
-    retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")]
+    retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")]
 
     for pdf_file in retrofit_files:
-        pdf_path = os.path.join(retrofit_folder_path, pdf_file)
-
-        # Attempt to read the first page of the PDF to determine the report type
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
-
-            if is_energy_report(first_page_text):
-                # Treat this as an Energy Report
-                return extract_epr(pdf_path)
-            elif "summary" in pdf_file.lower():
-                # Treat this as a Summary Report
-                return extract_summary_report(pdf_path)
-            elif is_summary_report(first_page_text):
-                # other ways to detect a summary report
-                # Treat this as a Summary Report
-                return extract_summary_report(pdf_path)
+        pdf_path = os.path.join(data_folder_path, pdf_file)
+        return detect_and_parse_report(pdf_path, pdf_file)
 
     # If no relevant PDF is found, raise an exception
     raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
@@ -162,30 +147,26 @@ def is_summary_report(text):
     return text.startswith("Summary Information")
 
 
-def extract_from_survey_folder_files(survey_folder_path):
+def detect_and_parse_report(pdf_path, pdf_file):
     """
-    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.
+    Detects the type of report and extracts the relevant data.
+    :param pdf_path: String path to the PDF file
+    :param pdf_file: String name of the PDF file
+    :return:
     """
-    survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")]
+    # Attempt to read the first page of the PDF to determine type
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        first_page_text = reader.pages[0].extract_text() if reader.pages else ""
 
-    for pdf_file in survey_files:
-        pdf_path = os.path.join(survey_folder_path, pdf_file)
-
-        # Attempt to read the first page of the PDF to determine type
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            first_page_text = reader.pages[0].extract_text() if reader.pages else ""
-
-            if is_energy_report(first_page_text):
-                # Treat this as an Energy Report
-                return extract_epr(pdf_path)
-            elif "summary" in pdf_file.lower():
-                # Treat this as a Summary Report
-                return extract_summary_report(pdf_path)
-            else:
-                raise NotImplementedError("Implement me")
-
-    return None
+        if is_energy_report(first_page_text):
+            # Treat this as an Energy Report
+            return extract_epr(pdf_path)
+        elif "summary" in pdf_file.lower():
+            # Treat this as a Summary Report
+            return extract_summary_report(pdf_path)
+        else:
+            raise NotImplementedError("Implement me")
 
 
 def main():
@@ -210,7 +191,7 @@ def main():
         if retrofit_folder:
             retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
             if os.listdir(retrofit_folder_path):  # If not empty
-                summary_data = extract_retrofit_assessment_folder(retrofit_folder_path)
+                summary_data = extract_retrofit_pdfs(retrofit_folder_path)
                 if summary_data:
                     summary_data = {
                         "survey_folder": survey_folder,
@@ -219,7 +200,7 @@ def main():
                     extracted_data.append(summary_data)
                     continue
         # If no retrofit folder or it was empty, check files in survey_folder
-        summary_data = extract_from_survey_folder_files(survey_folder_path)
+        summary_data = extract_retrofit_pdfs(survey_folder_path)
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,

From 1db4c4319e2b7992405fb977705a90e8b3fb8618 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 14:28:27 +0000
Subject: [PATCH 20/59] removing raising of exception at end of function

---
 etl/customers/stonewater/Wave 3 Preparation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 30a23e86..777f96c5 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -128,8 +128,8 @@ def extract_retrofit_pdfs(data_folder_path):
         pdf_path = os.path.join(data_folder_path, pdf_file)
         return detect_and_parse_report(pdf_path, pdf_file)
 
-    # If no relevant PDF is found, raise an exception
-    raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.")
+    # If no relevant PDF is found, exit
+    return None
 
 
 def is_energy_report(text):
@@ -199,6 +199,10 @@ def main():
                     }
                     extracted_data.append(summary_data)
                     continue
+            else:
+                # Then we have an empty Retrofit Assessment folder
+                continue
+
         # If no retrofit folder or it was empty, check files in survey_folder
         summary_data = extract_retrofit_pdfs(survey_folder_path)
         if summary_data:

From 2a17831c7223e7614c6413c2f2b4fa09aca3d3a9 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 17:16:27 +0000
Subject: [PATCH 21/59] added detection of condition report

---
 etl/customers/aiha/xml_extraction.py          | 26 ++++++++++---------
 .../stonewater/Wave 3 Preparation.py          | 18 ++++++++++---
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index c246105a..038e8593 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -74,25 +74,26 @@ def main():
     # The properties will still have "Very poor" ratings for their hot water
 
     # TODO
-    #   - AIH001-03 has a basement and so we should discount this area from the ground floor
     #   - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft
+    #     [Can't remember, not clear - Chenai will check]
     #   - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the
     #     best option for this property due to it being extrememly large and the walls being uninsulated. It might not
     #     be performant enough in the winter, when COP will be more like 1.5.
     #   - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are
     #     in the property? Does it make sense to have such a large solar PV system (5.6kWp)?
     #   - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C
-    #   - Generally, should we consider insulated doors?
+    #       - Potential measure - search for the cylinder and insulate it
     #   - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same
-    #     buulding
-    #   - AIH001-09 - The extension is 1900-1929 but has a cavity wall
-    #   - AIH001-09 - Is it not possible to install a loft hatch?
-    #   - AIH001-09 - Why is there assumed secondary heating?
+    #     buulding [Question for Lewis & Kevin]
+    #   - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from
+    #   the other unit]
+    #   - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin]
     #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
-    #   - AIH001-11 - The layout of this unit is confusing, is there roof access?
-    #   - AIH001-12 - Why was there not access to the cylinder?
-    #   - AIH001-12 - Is the need to draught proofing due to the windows?
-    #   - AIH001-04 - is the flat roof area correct?
+    #       [Question for Lewis & Kevin]
+    #   - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! - It's a Sun room!!]
+    #   - AIH001-12 - Why was there not access to the cylinder? [Sealed shut]
+    #   - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the
+    #                 windows]
 
     recommended_measures = [
         {
@@ -113,7 +114,7 @@ def main():
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "4kWp Solar PV system",
+                    "description": "5.6kWp Solar PV system",
                     "config": [
                         {
                             "size": "4kWp",
@@ -497,6 +498,7 @@ def main():
         {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'},
         {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None},
         {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
+        {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'},
         {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'},
         {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'},
         {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
@@ -505,7 +507,7 @@ def main():
         {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
         {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'},
         {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'},
-        {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}
+        {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'},
     ]
     pricing_data = pd.DataFrame(pricing_data)
 
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 777f96c5..62cec009 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -126,7 +126,10 @@ def extract_retrofit_pdfs(data_folder_path):
 
     for pdf_file in retrofit_files:
         pdf_path = os.path.join(data_folder_path, pdf_file)
-        return detect_and_parse_report(pdf_path, pdf_file)
+        extracted = detect_and_parse_report(pdf_path, pdf_file)
+        if extracted is not None:
+            return extracted
+        continue
 
     # If no relevant PDF is found, exit
     return None
@@ -165,10 +168,19 @@ def detect_and_parse_report(pdf_path, pdf_file):
         elif "summary" in pdf_file.lower():
             # Treat this as a Summary Report
             return extract_summary_report(pdf_path)
+        elif is_condition_report(first_page_text):
+            return None
         else:
             raise NotImplementedError("Implement me")
 
 
+def is_condition_report(text):
+    """
+    Determines if the provided text indicates that the PDF is a Condition Report.
+    """
+    return text.startswith("OsmosisACDNEWPAS2035ConditionReport")
+
+
 def main():
     """
     This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
@@ -191,7 +203,7 @@ def main():
         if retrofit_folder:
             retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
             if os.listdir(retrofit_folder_path):  # If not empty
-                summary_data = extract_retrofit_pdfs(retrofit_folder_path)
+                summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                 if summary_data:
                     summary_data = {
                         "survey_folder": survey_folder,
@@ -204,7 +216,7 @@ def main():
                 continue
 
         # If no retrofit folder or it was empty, check files in survey_folder
-        summary_data = extract_retrofit_pdfs(survey_folder_path)
+        summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,

From 54b09e88e15cfd6c824beff23f878525cb9d5d16 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 17:20:05 +0000
Subject: [PATCH 22/59] added usage of is_summary_report

---
 etl/customers/stonewater/Wave 3 Preparation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 62cec009..988a544a 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -165,7 +165,7 @@ def detect_and_parse_report(pdf_path, pdf_file):
         if is_energy_report(first_page_text):
             # Treat this as an Energy Report
             return extract_epr(pdf_path)
-        elif "summary" in pdf_file.lower():
+        elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
             # Treat this as a Summary Report
             return extract_summary_report(pdf_path)
         elif is_condition_report(first_page_text):

From 6e8d9a025cc5b64c1a632bd9c95de140e9e58f82 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 19:26:14 +0000
Subject: [PATCH 23/59] adjusting search epc function to handle pydantic issues
 for the moment

---
 .idea/Model.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 backend/SearchEpc.py                          |  10 +-
 .../livewest/route_march_2024_10_28.py        | 171 ++++++++++++++++++
 .../stonewater/Wave 3 Preparation.py          |   2 +
 5 files changed, 178 insertions(+), 9 deletions(-)
 create mode 100644 etl/customers/livewest/route_march_2024_10_28.py

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..850c0cda 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Engine" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..e4070118 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Engine" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 367d8c85..f9e978c6 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -256,16 +256,12 @@ class SearchEpc:
             else:
                 params = {"address": self.address1, "postcode": self.postcode}
 
+        url = os.path.join(self.client.domestic.host, "search")
+
         for retry in range(self.max_retries):
             try:
 
-                if "uprn" in params:
-                    # We use the direct call method inside, since we need to implement uprn as a valid
-                    # parameter for the search function
-                    url = os.path.join(self.client.domestic.host, "search")
-                    response = self.client.domestic.call(method="get", url=url, params=params)
-                else:
-                    response = self.client.domestic.search(params=params, size=size)
+                response = self.client.domestic.call(method="get", url=url, params=params)
 
                 if response:
                     self.data = response
diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
new file mode 100644
index 00000000..fff1e7e7
--- /dev/null
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -0,0 +1,171 @@
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from utils.s3 import read_excel_from_s3
+from backend.SearchEpc import SearchEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+    estimate_perimeter,
+    estimate_external_wall_area,
+    estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def app():
+    """
+    This app is EPC pulling data for some properties owned by Livewest
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+
+    """
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0
+    )
+
+    epc_data = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+
+        postcode = home["Postcode"]
+        house_number = home["Number"]
+        full_address = home["Full Address"]
+
+        searcher = SearchEpc(
+            address1=str(house_number),
+            postcode=postcode,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            property_type=None,
+            fast=True,
+            full_address=full_address
+        )
+        # Force the skipping of estimating the EPC
+        searcher.ordnance_survey_client.property_type = None
+        searcher.ordnance_survey_client.built_form = None
+
+        searcher.find_property(skip_os=True)
+        if searcher.newest_epc is None:
+            continue
+
+        # Look for EPC recommendatons
+        try:
+            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+        except:
+            property_recommendations = {"rows": []}
+
+        epc = {
+            "asset_list_address": full_address,
+            **searcher.newest_epc.copy(),
+            "recommendations": property_recommendations["rows"]
+        }
+
+        epc_data.append(epc)
+
+    epc_df = pd.DataFrame(epc_data)
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [
+            "asset_list_address",
+            "uprn",
+            "property-type",
+            "built-form",
+            "inspection-date",
+            "current-energy-rating",
+            "current-energy-efficiency",
+            "roof-description",
+            "walls-description",
+            "transaction-type",
+            # New fields needed
+            "secondheat-description",
+            "total-floor-area",
+            "construction-age-band",
+            "floor-height",
+            "number-habitable-rooms",
+            "mainheat-description"
+            #
+            "energy-consumption-current",  # kwh/m2
+        ]
+    ]
+
+    asset_list = asset_list.merge(
+        epc_df,
+        how="left",
+        left_on=["ADDRESS"],
+        right_on=["asset_list_address"]
+    )
+
+    asset_list = asset_list.drop(columns=["asset_list_address"])
+
+    # Rename the columns
+    asset_list = asset_list.rename(columns={
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC"
+    })
+
+    asset_list["Estimated Number of Floors"] = asset_list.apply(
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1
+    )
+
+    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_perimeter(
+            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+        ), axis=1
+    )
+
+    asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_external_wall_area(
+            num_floors=x["Estimated Number of Floors"],
+            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+            perimeter=x["Estimated Perimeter (m)"],
+            built_form=x["Archetype"]
+        ),
+        axis=1
+    )
+
+    asset_list["Roof Insulation Thickness"] = asset_list.apply(
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"],
+        axis=1
+    )
+
+    # Store as an excel
+    filename = "LHP EPC Data pull.xlsx"
+    asset_list.to_excel(filename, index=False)
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 988a544a..8e1a7fdb 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -226,5 +226,7 @@ def main():
 
     extracted_data = pd.DataFrame(extracted_data)
 
+    missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()]
+
 # if __name__ == "__main__":
 #     main()

From 86ca5b40074015c20dd35fe38eda7ac3799139f4 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 19:50:09 +0000
Subject: [PATCH 24/59] added catch for condition report

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 .../livewest/route_march_2024_10_28.py        | 69 ++++++++++---------
 .../stonewater/Wave 3 Preparation.py          |  2 +-
 4 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 850c0cda..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Engine" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index e4070118..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Engine" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
index fff1e7e7..47b86e89 100644
--- a/etl/customers/livewest/route_march_2024_10_28.py
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -1,4 +1,5 @@
 import os
+import time
 
 import pandas as pd
 from tqdm import tqdm
@@ -46,42 +47,46 @@ def app():
     )
 
     epc_data = []
+    errors = []
     for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-
-        postcode = home["Postcode"]
-        house_number = home["Number"]
-        full_address = home["Full Address"]
-
-        searcher = SearchEpc(
-            address1=str(house_number),
-            postcode=postcode,
-            auth_token=EPC_AUTH_TOKEN,
-            os_api_key="",
-            property_type=None,
-            fast=True,
-            full_address=full_address
-        )
-        # Force the skipping of estimating the EPC
-        searcher.ordnance_survey_client.property_type = None
-        searcher.ordnance_survey_client.built_form = None
-
-        searcher.find_property(skip_os=True)
-        if searcher.newest_epc is None:
-            continue
-
-        # Look for EPC recommendatons
         try:
-            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-        except:
-            property_recommendations = {"rows": []}
+            postcode = home["Postcode"]
+            house_number = home["Number"]
+            full_address = home["Full Address"]
 
-        epc = {
-            "asset_list_address": full_address,
-            **searcher.newest_epc.copy(),
-            "recommendations": property_recommendations["rows"]
-        }
+            searcher = SearchEpc(
+                address1=str(house_number),
+                postcode=postcode,
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=full_address
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
 
-        epc_data.append(epc)
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Look for EPC recommendatons
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except:
+                property_recommendations = {"rows": []}
+
+            epc = {
+                "asset_list_address": full_address,
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"]
+            }
+
+            epc_data.append(epc)
+        except Exception as e:
+            errors.append(e)
+            time.sleep(5)
 
     epc_df = pd.DataFrame(epc_data)
 
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 8e1a7fdb..fc11f1c0 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -178,7 +178,7 @@ def is_condition_report(text):
     """
     Determines if the provided text indicates that the PDF is a Condition Report.
     """
-    return text.startswith("OsmosisACDNEWPAS2035ConditionReport")
+    return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport")
 
 
 def main():

From 8bf5b23410caccce29ddfaaf30953c1b48db4c7d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 20:29:31 +0000
Subject: [PATCH 25/59] handling extraction of windows data

---
 .../livewest/route_march_2024_10_28.py        |  3 +-
 .../stonewater/Wave 3 Preparation.py          | 58 +++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
index 47b86e89..c19c78b1 100644
--- a/etl/customers/livewest/route_march_2024_10_28.py
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -61,7 +61,8 @@ def app():
                 os_api_key="",
                 property_type=None,
                 fast=True,
-                full_address=full_address
+                full_address=full_address,
+                max_retries=3
             )
             # Force the skipping of estimating the EPC
             searcher.ordnance_survey_client.property_type = None
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fc11f1c0..a8e06416 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3,6 +3,7 @@ import PyPDF2
 import re
 import pandas as pd
 from tqdm import tqdm
+from collections import Counter
 
 FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
 
@@ -19,6 +20,8 @@ def extract_summary_report(pdf_path):
         "Address": None,
         "Current SAP Rating": None,
         "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -61,9 +64,56 @@ def extract_summary_report(pdf_path):
         # Join non-empty parts with a comma
         data["Address"] = ", ".join([part for part in address_parts if part])
 
+        windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+        windows_text = windows_section.group(1)
+        window_data = extract_window_age_description(windows_text)
+        data.update(window_data)
+
     return data
 
 
+def extract_window_age_description(windows_text):
+    """
+    Extracts the most common window age description and its proportion.
+
+    Parameters:
+        windows_text (str): The text section containing window data.
+
+    Returns:
+        dict: A dictionary with the most common window age description and its proportion.
+    """
+    # Clean up windows_text by removing line breaks for better pattern matching
+    windows_text = windows_text.replace("\n", "")
+
+    # Define possible window age descriptions
+    window_descriptions = [
+        "Double post or during 2002",
+        "Double pre 2002",
+        "Double with unknown install date",
+        "Secondary glazing",
+        "Triple glazing",
+        "Single glazing",
+    ]
+
+    # Count occurrences of each description
+    description_counts = Counter()
+    for description in window_descriptions:
+        matches = re.findall(re.escape(description), windows_text)
+        description_counts[description] = len(matches)
+
+    if not description_counts or not sum(description_counts.values()):
+        raise ValueError("Failed to extract window data.")
+
+    # Determine the most common description and calculate its proportion
+    most_common_description, window_count = description_counts.most_common(1)[0]
+    window_proportion = window_count / sum(description_counts.values()) * 100
+
+    return {
+        "Window Age Description": most_common_description,
+        "Window Age Description Proportion (%)": window_proportion
+    }
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -74,6 +124,8 @@ def extract_epr(pdf_path):
         "Space Heating": None,
         "Water Heating": None,
         "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -115,6 +167,12 @@ def extract_epr(pdf_path):
         fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
         data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
 
+        # Extract the windows data
+        windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+        windows_text = windows_section.group(1)
+        window_data = extract_window_age_description(windows_text)
+        data.update(window_data)
+
     return data
 
 

From e22baed16fcf6ce86e38266d557aab3cc529953d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 12:29:24 +0000
Subject: [PATCH 26/59] sorted livewest data pull

---
 .idea/Model.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 .../livewest/route_march_2024_10_28.py        | 148 ++++++++++++------
 .../stonewater/Wave 3 Preparation.py          |   2 +
 4 files changed, 102 insertions(+), 52 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..850c0cda 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Engine" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..e4070118 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Engine" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
index c19c78b1..1b259fba 100644
--- a/etl/customers/livewest/route_march_2024_10_28.py
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -19,6 +19,53 @@ load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
+def get_data(asset_list):
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            postcode = home["Postcode"]
+            house_number = home["Number"]
+            full_address = home["Full Address"]
+
+            searcher = SearchEpc(
+                address1=str(house_number),
+                postcode=postcode,
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=full_address,
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Look for EPC recommendatons
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except:
+                property_recommendations = {"rows": []}
+
+            epc = {
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"]
+            }
+
+            epc_data.append(epc)
+        except Exception as e:
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
 def app():
     """
     This app is EPC pulling data for some properties owned by Livewest
@@ -45,56 +92,49 @@ def app():
     asset_list = pd.read_excel(
         "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0
     )
+    asset_list["row_id"] = asset_list.index
 
-    epc_data = []
-    errors = []
-    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-        try:
-            postcode = home["Postcode"]
-            house_number = home["Number"]
-            full_address = home["Full Address"]
+    epc_data, errors = get_data(asset_list)
 
-            searcher = SearchEpc(
-                address1=str(house_number),
-                postcode=postcode,
-                auth_token=EPC_AUTH_TOKEN,
-                os_api_key="",
-                property_type=None,
-                fast=True,
-                full_address=full_address,
-                max_retries=3
-            )
-            # Force the skipping of estimating the EPC
-            searcher.ordnance_survey_client.property_type = None
-            searcher.ordnance_survey_client.built_form = None
+    # We now retrieve any failed properties
+    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+    epc_data_failed, _ = get_data(asset_list_failed)
 
-            searcher.find_property(skip_os=True)
-            if searcher.newest_epc is None:
-                continue
-
-            # Look for EPC recommendatons
-            try:
-                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-            except:
-                property_recommendations = {"rows": []}
-
-            epc = {
-                "asset_list_address": full_address,
-                **searcher.newest_epc.copy(),
-                "recommendations": property_recommendations["rows"]
-            }
-
-            epc_data.append(epc)
-        except Exception as e:
-            errors.append(e)
-            time.sleep(5)
+    # Append the failed data to the main data
+    epc_data.extend(epc_data_failed)
 
     epc_df = pd.DataFrame(epc_data)
 
+    # We expand out the recommendations
+    recommendations_df = epc_df[["row_id", "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    columns = ["row_id"] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data["row_id"] = row["row_id"]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # Drop the column that is ""
+    transformed_df = transformed_df.drop(columns=[""])
+
     # Retrieve just the data we need
     epc_df = epc_df[
         [
-            "asset_list_address",
+            "row_id",
             "uprn",
             "property-type",
             "built-form",
@@ -110,7 +150,7 @@ def app():
             "construction-age-band",
             "floor-height",
             "number-habitable-rooms",
-            "mainheat-description"
+            "mainheat-description",
             #
             "energy-consumption-current",  # kwh/m2
         ]
@@ -119,11 +159,14 @@ def app():
     asset_list = asset_list.merge(
         epc_df,
         how="left",
-        left_on=["ADDRESS"],
-        right_on=["asset_list_address"]
+        on="row_id"
+    ).merge(
+        transformed_df,
+        how="left",
+        on="row_id"
     )
 
-    asset_list = asset_list.drop(columns=["asset_list_address"])
+    asset_list = asset_list.drop(columns=["row_id"])
 
     # Rename the columns
     asset_list = asset_list.rename(columns={
@@ -140,14 +183,18 @@ def app():
         "roof-description": "Roof Construction",
         "mainheat-description": "Heating Type",
         "secondheat-description": "Secondary Heating",
-        "transaction-type": "Reason for last EPC"
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)"
     })
 
     asset_list["Estimated Number of Floors"] = asset_list.apply(
-        lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+            x["Property Type"]) else None, axis=1
     )
 
     asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    # Replace "" value with None
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
     asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
 
     asset_list["Estimated Perimeter (m)"] = asset_list.apply(
@@ -157,7 +204,7 @@ def app():
         ), axis=1
     )
 
-    asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply(
+    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
         lambda x: estimate_external_wall_area(
             num_floors=x["Estimated Number of Floors"],
             floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
@@ -168,10 +215,11 @@ def app():
     )
 
     asset_list["Roof Insulation Thickness"] = asset_list.apply(
-        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"],
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+            x["Roof Construction"]) else None,
         axis=1
     )
 
     # Store as an excel
-    filename = "LHP EPC Data pull.xlsx"
+    filename = "livewest EPC Data pull - 29 Oct.xlsx"
     asset_list.to_excel(filename, index=False)
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index a8e06416..d8d01b22 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -283,6 +283,8 @@ def main():
             extracted_data.append(summary_data)
 
     extracted_data = pd.DataFrame(extracted_data)
+    # Save this as a csv
+    # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
 
     missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()]
 

From b7f402ba9d699ede3693068f8bec9e2087c0a8aa Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 13:55:18 +0000
Subject: [PATCH 27/59] added # Storeys

---
 .idea/Model.iml                                |  2 +-
 .idea/misc.xml                                 |  2 +-
 etl/customers/stonewater/Wave 3 Preparation.py | 11 +++++++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 850c0cda..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Engine" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index e4070118..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Engine" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d8d01b22..b1b48cec 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -19,6 +19,7 @@ def extract_summary_report(pdf_path):
     data = {
         "Address": None,
         "Current SAP Rating": None,
+        "Number of Storeys": None,
         "Fuel Bill": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
@@ -32,13 +33,15 @@ def extract_summary_report(pdf_path):
 
         # Extract Current SAP rating
         sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
-        if sap_match:
-            data["Current SAP Rating"] = sap_match.group(1)
+        data["Current SAP Rating"] = sap_match.group(1)
+
+        # Number of storeys
+        storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
+        data["Number of Storeys"] = int(storeys_match.group(1))
 
         # Extract Fuel Bill
         fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
-        if fuel_bill_match:
-            data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
 
         # Extract individual address components
         postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)

From 753bda6cb0bc4c8de266944c04ab99db7d74da3d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 14:21:01 +0000
Subject: [PATCH 28/59] extracting heating systems from summary report

---
 .../stonewater/Wave 3 Preparation.py          | 86 ++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b1b48cec..863a6a6c 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -19,10 +19,26 @@ def extract_summary_report(pdf_path):
     data = {
         "Address": None,
         "Current SAP Rating": None,
-        "Number of Storeys": None,
+        "Space Heating": None,
+        "Water Heating": None,
         "Fuel Bill": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
+        "Secondary Window Age Description": None,
+        "Secondary Window Age Description Proportion (%)": None,
+        "Number of Windows": None,
+        "Total Number of Doors": None,
+        "Number of Insulated Doors": None,
+        "Existing Primary Heating System": None,
+        "Existing Primary Heating PCDF Reference": None,
+        "Existing Primary Heating Controls": None,
+        "Existing Primary Heating % of Heat": None,
+        "Existing Secondary Heating System": None,
+        "Existing Secondary Heating PCDF Reference": None,
+        "Existing Secondary Heating Controls": None,
+        "Existing Secondary Heating % of Heat": None,
+        "Secondary Heating Code": None,
+        "Water Heating Code": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -39,6 +55,10 @@ def extract_summary_report(pdf_path):
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
         data["Number of Storeys"] = int(storeys_match.group(1))
 
+        # Extract Carbon Emissions
+        carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
+        data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))
+
         # Extract Fuel Bill
         fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
         data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
@@ -66,12 +86,58 @@ def extract_summary_report(pdf_path):
 
         # Join non-empty parts with a comma
         data["Address"] = ", ".join([part for part in address_parts if part])
+        data["Postcode"] = postcode.group(1).strip()
 
         windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
         windows_text = windows_section.group(1)
         window_data = extract_window_age_description(windows_text)
         data.update(window_data)
 
+        # Extract Total Number of Doors
+        total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text)
+        data["Total Number of Doors"] = int(total_doors_match.group(1))
+
+        # Extract Number of Insulated Doors
+        insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text)
+        data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
+
+        # Extract heating system
+        # Extract Primary Heating Data
+        # Extract Primary Heating Section
+        primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
+        primary_text = primary_heating_section.group(1)
+
+        data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
+            1).strip()
+        data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                    primary_text).group(1)
+        data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(
+            1).strip()
+        data["Existing Primary Heating % of Heat"] = int(
+            re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
+        )
+
+        # Extract Secondary Heating Section
+        secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+        secondary_text = secondary_heating_section.group(1)
+
+        data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group(
+            1).strip()
+        data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                      secondary_text).group(1)
+        data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
+                                                                secondary_text).group(1).strip()
+        data["Existing Secondary Heating % of Heat"] = int(
+            re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
+        )
+
+        # Extract Secondary Heating and Water Heating Codes
+        secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
+        water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
+
+        data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+        data["Water Heating Code"] = water_heating_code_match.group(1).strip()
+
     return data
 
 
@@ -111,9 +177,20 @@ def extract_window_age_description(windows_text):
     most_common_description, window_count = description_counts.most_common(1)[0]
     window_proportion = window_count / sum(description_counts.values()) * 100
 
+    # Get the second most common and the proportion
+    if window_proportion == 100:
+        second_most_common_description = None
+        second_most_common_proportion = 0
+    else:
+        second_most_common_description, second_window_count = description_counts.most_common(2)[1]
+        second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100
+
     return {
         "Window Age Description": most_common_description,
-        "Window Age Description Proportion (%)": window_proportion
+        "Window Age Description Proportion (%)": window_proportion,
+        "Secondary Window Age Description": second_most_common_description,
+        "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
+        "Number of Windows": sum(description_counts.values())
     }
 
 
@@ -129,6 +206,11 @@ def extract_epr(pdf_path):
         "Fuel Bill": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
+        "Secondary Window Age Description": None,
+        "Secondary Window Age Description Proportion (%)": None,
+        "Number of Windows": None,
+        "Total Number of Doors": None,
+        "Number of Insulated Doors": None,
     }
 
     with open(pdf_path, "rb") as file:

From 364b5b07e8f1ff29b3da3625014e4250fc5954ce Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 14:46:01 +0000
Subject: [PATCH 29/59] adding to extract eprs

---
 .../stonewater/Wave 3 Preparation.py          | 101 +++++++++++++-----
 1 file changed, 73 insertions(+), 28 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 863a6a6c..4ab33732 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -18,6 +18,7 @@ def extract_summary_report(pdf_path):
     """
     data = {
         "Address": None,
+        "Postcode": None,
         "Current SAP Rating": None,
         "Space Heating": None,
         "Water Heating": None,
@@ -200,7 +201,9 @@ def extract_epr(pdf_path):
     """
     data = {
         "Address": None,
+        "Postcode": None,
         "Current SAP Rating": None,
+        "Potential SAP Rating": None,
         "Space Heating": None,
         "Water Heating": None,
         "Fuel Bill": None,
@@ -211,6 +214,16 @@ def extract_epr(pdf_path):
         "Number of Windows": None,
         "Total Number of Doors": None,
         "Number of Insulated Doors": None,
+        "Existing Primary Heating System": None,
+        "Existing Primary Heating PCDF Reference": None,
+        "Existing Primary Heating Controls": None,
+        "Existing Primary Heating % of Heat": None,
+        "Existing Secondary Heating System": None,
+        "Existing Secondary Heating PCDF Reference": None,
+        "Existing Secondary Heating Controls": None,
+        "Existing Secondary Heating % of Heat": None,
+        "Secondary Heating Code": None,
+        "Water Heating Code": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -222,41 +235,73 @@ def extract_epr(pdf_path):
         # Extract Address
         address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
         data["Address"] = address_match.group(1).strip()
+        data["Postcode"] = data["Address"].split(",")[-1].strip()
 
-        # Extract Total Floor Area
-        # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
-        # data["Total Floor Area"] = area_match.group(1)
-
-        # Extract Current SAP rating
-        # Updated Regular Expression to find "GG (1-20)" followed by two numbers
+        # Extract Current and Potential SAP ratings
         sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
+        current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
+        data["Current SAP Rating"] = current_sap
 
-        # Extract and validate the Current and Potential SAP ratings
-        current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
-        # Ensure potential is greater than or equal to current
-        if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
-            data["Current SAP Rating"] = current_sap
-            data["Potential SAP Rating"] = potential_sap
-        else:
-            raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
-
-        # Extract Space Heating (kWh)
-        space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text)
-        data["Space Heating"] = int(space_heating_match.group(1))
-
-        # Extract Water Heating (kWh)
-        water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text)
-        data["Water Heating"] = int(water_heating_match.group(1))
-
-        # Extract Fuel Bill (total estimated costs)
+        # Extract Fuel Bill
         fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
         data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
 
-        # Extract the windows data
+        # Extract Total Number of Doors
+        total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
+        data["Total Number of Doors"] = int(total_doors_match.group(1))
+
+        # Extract Number of Insulated Doors
+        insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
+        data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
+
+        # Extract Primary Heating Section (Main Heating 1)
+        primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
+        primary_text = primary_heating_section.group(1)
+
+        data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
+            1).strip()
+        data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                    primary_text).group(1)
+        data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(
+            1).strip()
+        data["Existing Primary Heating % of Heat"] = int(
+            re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
+        )
+
+        # Extract Secondary Heating Section (Main Heating 2)
+        secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
+        secondary_text = secondary_heating_section.group(1)
+
+        data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group(
+            1).strip()
+        data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                      secondary_text).group(1)
+
+        if data["Existing Secondary Heating System"] == "":
+            data["Existing Secondary Heating Controls"] = ""
+        else:
+            data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
+                                                                    secondary_text).group(1).strip()
+        data["Existing Secondary Heating % of Heat"] = int(
+            re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
+        )
+
+        # Extract Secondary Heating and Water Heating Codes
+        secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
+        water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
+
+        if data["Existing Secondary Heating System"] == "":
+            data["Secondary Heating Code"] = ""
+        else:
+            data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+        data["Water Heating Code"] = water_heating_code_match.group(1).strip()
+
+        # Extract Windows information
         windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
-        windows_text = windows_section.group(1)
-        window_data = extract_window_age_description(windows_text)
-        data.update(window_data)
+        if windows_section:
+            windows_text = windows_section.group(1)
+            window_data = extract_window_age_description(windows_text)
+            data.update(window_data)
 
     return data
 

From 9eb4720c91d22ed2084364d92a0c99cbb3088adc Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 14:54:19 +0000
Subject: [PATCH 30/59] added peui

---
 etl/customers/stonewater/Wave 3 Preparation.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 4ab33732..1b7b1bcd 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -20,9 +20,8 @@ def extract_summary_report(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
-        "Space Heating": None,
-        "Water Heating": None,
         "Fuel Bill": None,
+        "Number of Storeys": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
         "Secondary Window Age Description": None,
@@ -203,9 +202,8 @@ def extract_epr(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
-        "Potential SAP Rating": None,
-        "Space Heating": None,
-        "Water Heating": None,
+        "Primary Energy Use Intensity (kWh/m2/yr)": None,
+        "Number of Storeys": None,
         "Fuel Bill": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
@@ -242,6 +240,14 @@ def extract_epr(pdf_path):
         current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
         data["Current SAP Rating"] = current_sap
 
+        # Extract the primary energy use intensity
+        additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
+        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+
+        # Extract Number of Storeys
+        storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
+        data["Number of Storeys"] = int(storeys_match.group(1))
+
         # Extract Fuel Bill
         fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
         data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

From b74b8823d18d428888fd832c515cc81cb2c6bdf1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 14:59:32 +0000
Subject: [PATCH 31/59] fixing bug extracting from epr

---
 .../stonewater/Wave 3 Preparation.py          | 54 ++++++++++++-------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 1b7b1bcd..02a5cd83 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -261,36 +261,50 @@ def extract_epr(pdf_path):
         data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
 
         # Extract Primary Heating Section (Main Heating 1)
-        primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
+        primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
+        # We may not have a secondary heating
+        primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
+        primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
         primary_text = primary_heating_section.group(1)
 
-        data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
-            1).strip()
-        data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                    primary_text).group(1)
-        data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(
-            1).strip()
+        data["Existing Primary Heating System"] = re.search(
+            r"Main Heating Code\s*(.*?)\n", primary_text
+        ).group(1).strip()
+        data["Existing Primary Heating PCDF Reference"] = re.search(
+            r"PCDF boiler Reference\s*(\d+)", primary_text
+        ).group(1)
+        data["Existing Primary Heating Controls"] = re.search(
+            r"Main Heating Controls\s*(.*?)\n", primary_text
+        ).group(1).strip()
         data["Existing Primary Heating % of Heat"] = int(
             re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
         )
 
         # Extract Secondary Heating Section (Main Heating 2)
         secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
-        secondary_text = secondary_heating_section.group(1)
-
-        data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group(
-            1).strip()
-        data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                      secondary_text).group(1)
-
-        if data["Existing Secondary Heating System"] == "":
+        if secondary_heating_section is None:
+            data["Existing Secondary Heating System"] = ""
+            data["Existing Secondary Heating PCDF Reference"] = ""
             data["Existing Secondary Heating Controls"] = ""
+            data["Existing Secondary Heating % of Heat"] = 0
+            
         else:
-            data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
-                                                                    secondary_text).group(1).strip()
-        data["Existing Secondary Heating % of Heat"] = int(
-            re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
-        )
+            secondary_text = secondary_heating_section.group(1)
+
+            data["Existing Secondary Heating System"] = re.search(
+                r"Main Heating Code\s*(.*?)\n", secondary_text
+            ).group(1).strip()
+            data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                          secondary_text).group(1)
+
+            if data["Existing Secondary Heating System"] == "":
+                data["Existing Secondary Heating Controls"] = ""
+            else:
+                data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
+                                                                        secondary_text).group(1).strip()
+            data["Existing Secondary Heating % of Heat"] = int(
+                re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
+            )
 
         # Extract Secondary Heating and Water Heating Codes
         secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)

From 9e752fca8db65d829cdac4ff15fc874fd086ad6d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 15:03:23 +0000
Subject: [PATCH 32/59] handling edge case extracting from summary report

---
 etl/customers/stonewater/Wave 3 Preparation.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 02a5cd83..0af43310 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -109,10 +109,12 @@ def extract_summary_report(pdf_path):
 
         data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
             1).strip()
-        data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                    primary_text).group(1)
-        data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(
-            1).strip()
+        data["Existing Primary Heating PCDF Reference"] = re.search(
+            r"PCDF boiler Reference\s*(\d+)", primary_text
+        ).group(1)
+        data["Existing Primary Heating Controls"] = re.search(
+            r"Main Heating Controls\s*(.*?)\n", primary_text
+        ).group(1).strip()
         data["Existing Primary Heating % of Heat"] = int(
             re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
         )
@@ -125,8 +127,10 @@ def extract_summary_report(pdf_path):
             1).strip()
         data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
                                                                       secondary_text).group(1)
-        data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
-                                                                secondary_text).group(1).strip()
+        second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
+        data["Existing Secondary Heating Controls"] = (
+            second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
+        )
         data["Existing Secondary Heating % of Heat"] = int(
             re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
         )
@@ -287,7 +291,7 @@ def extract_epr(pdf_path):
             data["Existing Secondary Heating PCDF Reference"] = ""
             data["Existing Secondary Heating Controls"] = ""
             data["Existing Secondary Heating % of Heat"] = 0
-            
+
         else:
             secondary_text = secondary_heating_section.group(1)
 

From a9ce5b68bb6b506b62179c7abac5f43da2498ad1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 15:11:16 +0000
Subject: [PATCH 33/59] debug extract of main heating code

---
 etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 0af43310..bb100ae1 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -123,8 +123,8 @@ def extract_summary_report(pdf_path):
         secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
         secondary_text = secondary_heating_section.group(1)
 
-        data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group(
-            1).strip()
+        main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text)
+        data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip()
         data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
                                                                       secondary_text).group(1)
         second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
@@ -139,7 +139,11 @@ def extract_summary_report(pdf_path):
         secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
         water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
 
-        data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+        if data["Existing Secondary Heating System"] == "":
+            data["Secondary Heating Code"] = ""
+        else:
+            data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+
         data["Water Heating Code"] = water_heating_code_match.group(1).strip()
 
     return data

From 48369ae1505a769339f7adaf713d809e0bfdd208 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 15:18:11 +0000
Subject: [PATCH 34/59] refactor to prioritise epc

---
 .../stonewater/Wave 3 Preparation.py          | 66 +++++++++++++++----
 1 file changed, 54 insertions(+), 12 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index bb100ae1..7f4f81e9 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -123,8 +123,10 @@ def extract_summary_report(pdf_path):
         secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
         secondary_text = secondary_heating_section.group(1)
 
-        main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text)
-        data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip()
+        main_heating_code_match_secondary = re.search(
+            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
+        )
+        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
         data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
                                                                       secondary_text).group(1)
         second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
@@ -299,11 +301,14 @@ def extract_epr(pdf_path):
         else:
             secondary_text = secondary_heating_section.group(1)
 
-            data["Existing Secondary Heating System"] = re.search(
-                r"Main Heating Code\s*(.*?)\n", secondary_text
-            ).group(1).strip()
-            data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                          secondary_text).group(1)
+            main_heating_code_match_secondary = re.search(
+                r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
+            )
+            data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
+
+            data["Existing Secondary Heating PCDF Reference"] = re.search(
+                r"PCDF boiler Reference\s*(\d+)", secondary_text
+            ).group(1)
 
             if data["Existing Secondary Heating System"] == "":
                 data["Existing Secondary Heating Controls"] = ""
@@ -334,20 +339,57 @@ def extract_epr(pdf_path):
     return data
 
 
+def detect_report_type(pdf_path, pdf_file):
+    """
+    Detects the type of report based on content or filename.
+    :param pdf_path: String path to the PDF file
+    :param pdf_file: String name of the PDF file
+    :return: String type of the report ("epr", "summary", or None)
+    """
+    # Attempt to read the first page of the PDF to determine type
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        first_page_text = reader.pages[0].extract_text() if reader.pages else ""
+
+        if is_energy_report(first_page_text):
+            return "epr"
+        elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
+            return "summary"
+        elif is_condition_report(first_page_text):
+            return "condition"
+
+    return None
+
+
 def extract_retrofit_pdfs(data_folder_path):
     """
     Handles extraction from a retrofit data folder if it exists and has content.
+    Prioritizes extracting data from an EPR if both EPR and summary report are present.
     """
     retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")]
+    report_types = {"epr": None, "summary": None}
 
+    # First, identify the types of reports available
     for pdf_file in retrofit_files:
         pdf_path = os.path.join(data_folder_path, pdf_file)
-        extracted = detect_and_parse_report(pdf_path, pdf_file)
-        if extracted is not None:
-            return extracted
-        continue
+        report_type = detect_report_type(pdf_path, pdf_file)
 
-    # If no relevant PDF is found, exit
+        if report_type == "epr":
+            report_types["epr"] = pdf_path
+        elif report_type == "summary":
+            report_types["summary"] = pdf_path
+
+        # Stop checking further if both EPR and summary are found
+        if report_types["epr"] and report_types["summary"]:
+            break
+
+    # Extract data based on report availability and priority
+    if report_types["epr"]:
+        return extract_epr(report_types["epr"])
+    elif report_types["summary"]:
+        return extract_summary_report(report_types["summary"])
+
+    # If no relevant PDF is found, return None
     return None
 
 

From 5af1836aa7731613ed58437586ca7e592a66150a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 16:32:25 +0000
Subject: [PATCH 35/59] extracting dimensions from epr

---
 .../stonewater/Wave 3 Preparation.py          | 82 ++++++++++++++++++-
 1 file changed, 78 insertions(+), 4 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 7f4f81e9..0b660c76 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -16,6 +16,7 @@ def extract_summary_report(pdf_path):
     - Fuel Bill
     - Address
     """
+    blah
     data = {
         "Address": None,
         "Postcode": None,
@@ -56,8 +57,8 @@ def extract_summary_report(pdf_path):
         data["Number of Storeys"] = int(storeys_match.group(1))
 
         # Extract Carbon Emissions
-        carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
-        data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))
+        # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
+        # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))
 
         # Extract Fuel Bill
         fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
@@ -204,6 +205,69 @@ def extract_window_age_description(windows_text):
     }
 
 
+def extract_building_parts_epr(text):
+    """
+    Extracts building parts and associated dimensions from the provided PDF file.
+    Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length.
+    """
+    data = []
+
+    # Pattern to locate each "Building part" section
+    building_part_pattern = re.compile(
+        r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
+        r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
+        re.DOTALL
+    )
+
+    # Extract each building part
+    for match in building_part_pattern.finditer(text):
+        part_name = match.group(1).strip()
+        # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension")
+        cleaned_part_name = re.sub(r" - built in.*", "", part_name)
+
+        floor_data = match.group(2)
+
+        # Pattern to match each floor's measurements
+        floor_pattern = re.compile(
+            r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+        )
+
+        # Extract floor details for each building part
+        for floor_match in floor_pattern.finditer(floor_data):
+            floor_level = floor_match.group(1)
+            floor_area = float(floor_match.group(2))
+            room_height = float(floor_match.group(3))
+            perimeter = float(floor_match.group(4))
+            party_wall_length = float(floor_match.group(5))
+
+            # Append to data
+            data.append({
+                "Building Part": cleaned_part_name,
+                "Floor Level": floor_level,
+                "Floor Area (m2)": floor_area,
+                "Room Height (m)": room_height,
+                "Perimeter (m)": perimeter,
+                "Party Wall Length (m)": party_wall_length
+            })
+
+    # We now extract out the aggregated data
+
+    main_building = [part for part in data if "Main" in part["Building Part"]]
+    first_extension = [part for part in data if "1st Extension" in part["Building Part"]]
+    dimensions = {
+        "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
+        "Total Ground Floor Area": sum(
+            [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]]
+        ),
+        "RIR Floor Area": 0,
+        "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]),
+        "First Extension Wall Area (m2)": sum(
+            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0,
+    }
+
+    return dimensions
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -212,6 +276,7 @@ def extract_epr(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
+        "Primary Energy Use (kWh/yr)": None,
         "Primary Energy Use Intensity (kWh/m2/yr)": None,
         "Number of Storeys": None,
         "Fuel Bill": None,
@@ -232,6 +297,11 @@ def extract_epr(pdf_path):
         "Existing Secondary Heating % of Heat": None,
         "Secondary Heating Code": None,
         "Water Heating Code": None,
+        'Total Floor Area (m2)': None,
+        'Total Ground Floor Area': None,
+        'RIR Floor Area': None,
+        'Main Building Wall Area (m2)': None,
+        'First Extension Wall Area (m2)': None
     }
 
     with open(pdf_path, "rb") as file:
@@ -336,6 +406,9 @@ def extract_epr(pdf_path):
             window_data = extract_window_age_description(windows_text)
             data.update(window_data)
 
+        building_parts = extract_building_parts_epr(text)
+        data.update(building_parts)
+
     return data
 
 
@@ -465,7 +538,7 @@ def main():
                 if summary_data:
                     summary_data = {
                         "survey_folder": survey_folder,
-                        **summary_data
+                        **summary_data,
                     }
                     extracted_data.append(summary_data)
                     continue
@@ -474,11 +547,12 @@ def main():
                 continue
 
         # If no retrofit folder or it was empty, check files in survey_folder
+
         summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,
-                **summary_data
+                **summary_data,
             }
             extracted_data.append(summary_data)
 

From 4e752fb6c48cb163e4350f32eceb14f5a97d2a94 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:00:02 +0000
Subject: [PATCH 36/59] added summary table dimension extraction

---
 .../stonewater/Wave 3 Preparation.py          | 82 ++++++++++++++++++-
 1 file changed, 79 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 0b660c76..b660ab64 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -16,7 +16,6 @@ def extract_summary_report(pdf_path):
     - Fuel Bill
     - Address
     """
-    blah
     data = {
         "Address": None,
         "Postcode": None,
@@ -40,6 +39,11 @@ def extract_summary_report(pdf_path):
         "Existing Secondary Heating % of Heat": None,
         "Secondary Heating Code": None,
         "Water Heating Code": None,
+        'Total Floor Area (m2)': None,
+        'Total Ground Floor Area (m2)': None,
+        'RIR Floor Area': None,
+        'Main Building Wall Area (m2)': None,
+        'First Extension Wall Area (m2)': None
     }
 
     with open(pdf_path, "rb") as file:
@@ -149,6 +153,9 @@ def extract_summary_report(pdf_path):
 
         data["Water Heating Code"] = water_heating_code_match.group(1).strip()
 
+        dimensions = extract_building_parts_summary(text)
+        data.update(dimensions)
+
     return data
 
 
@@ -256,7 +263,7 @@ def extract_building_parts_epr(text):
     first_extension = [part for part in data if "1st Extension" in part["Building Part"]]
     dimensions = {
         "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
-        "Total Ground Floor Area": sum(
+        "Total Ground Floor Area (m2)": sum(
             [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]]
         ),
         "RIR Floor Area": 0,
@@ -268,6 +275,75 @@ def extract_building_parts_epr(text):
     return dimensions
 
 
+def extract_building_parts_summary(text):
+    """
+    Extracts building parts and associated dimensions from the summary report PDF.
+    This includes Main Property and multiple extensions if they exist.
+    """
+    data = []
+
+    # Locate the Dimensions section
+    dimensions_section = re.search(
+        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
+    )
+    if not dimensions_section:
+        raise ValueError("Failed to locate dimensions section in the text.")
+
+    dimensions_text = dimensions_section.group(1)
+
+    # Pattern to extract each building part, starting from Main Property and including extensions
+    building_part_pattern = re.compile(
+        r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
+        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)",
+        re.DOTALL
+    )
+
+    # Loop through each building part match, including Main Property and extensions
+    for match in building_part_pattern.finditer(dimensions_text):
+        part_name = match.group(1)
+        floor_data = match.group(2)
+
+        # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length
+        floor_pattern = re.compile(
+            r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+        )
+
+        # Extract data for each floor within the building part
+        for floor_match in floor_pattern.finditer(floor_data):
+            floor_level = floor_match.group(1)
+            floor_area = float(floor_match.group(2))
+            room_height = float(floor_match.group(3))
+            perimeter = float(floor_match.group(4))
+            party_wall_length = float(floor_match.group(5))
+
+            # Append to data list
+            data.append({
+                "Building Part": part_name,
+                "Floor Level": floor_level,
+                "Floor Area (m2)": floor_area,
+                "Room Height (m)": room_height,
+                "Perimeter (m)": perimeter,
+                "Party Wall Length (m)": party_wall_length
+            })
+
+    # Calculate aggregated dimensions
+    main_property = [part for part in data if "Main Property" in part["Building Part"]]
+    first_extensions = [part for part in data if "1st Extension" in part["Building Part"]]
+    dimensions = {
+        "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
+        "Total Ground Floor Area (m2)": sum(
+            [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]]
+        ),
+        "RIR Floor Area": 0,
+        "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]),
+        "First Extension Wall Area (m2)": sum(
+            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions]
+        ),
+    }
+
+    return dimensions
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -298,7 +374,7 @@ def extract_epr(pdf_path):
         "Secondary Heating Code": None,
         "Water Heating Code": None,
         'Total Floor Area (m2)': None,
-        'Total Ground Floor Area': None,
+        'Total Ground Floor Area (m2)': None,
         'RIR Floor Area': None,
         'Main Building Wall Area (m2)': None,
         'First Extension Wall Area (m2)': None

From a30ad1762a37c81c326412c43cfaa5c91f721ad0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:05:37 +0000
Subject: [PATCH 37/59] handled problem case for summary dimensions

---
 etl/customers/stonewater/Wave 3 Preparation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b660ab64..1973cbd8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -294,7 +294,7 @@ def extract_building_parts_summary(text):
     # Pattern to extract each building part, starting from Main Property and including extensions
     building_part_pattern = re.compile(
         r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
-        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)",
+        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
         re.DOTALL
     )
 

From 98ae672a6160d84e099125904dac390eda1f6fa2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:24:16 +0000
Subject: [PATCH 38/59] debugging secondary heating code

---
 etl/customers/stonewater/Wave 3 Preparation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 1973cbd8..84d67f56 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -472,7 +472,8 @@ def extract_epr(pdf_path):
         if data["Existing Secondary Heating System"] == "":
             data["Secondary Heating Code"] = ""
         else:
-            data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+            data["Secondary Heating Code"] = secondary_heating_code_match.group(
+                1).strip() if secondary_heating_code_match else ""
         data["Water Heating Code"] = water_heating_code_match.group(1).strip()
 
         # Extract Windows information

From d8e8b997a46bf278154cea08444f9b8add3386c5 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:31:23 +0000
Subject: [PATCH 39/59] extend to get dimensions from 2nd floor

---
 etl/customers/stonewater/Wave 3 Preparation.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 84d67f56..ad35e2d5 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -149,7 +149,8 @@ def extract_summary_report(pdf_path):
         if data["Existing Secondary Heating System"] == "":
             data["Secondary Heating Code"] = ""
         else:
-            data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
+            data["Secondary Heating Code"] = secondary_heating_code_match.group(
+                1).strip() if secondary_heating_code_match else ""
 
         data["Water Heating Code"] = water_heating_code_match.group(1).strip()
 
@@ -236,7 +237,7 @@ def extract_building_parts_epr(text):
 
         # Pattern to match each floor's measurements
         floor_pattern = re.compile(
-            r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+            r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
         )
 
         # Extract floor details for each building part
@@ -305,7 +306,7 @@ def extract_building_parts_summary(text):
 
         # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length
         floor_pattern = re.compile(
-            r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+            r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
         )
 
         # Extract data for each floor within the building part
@@ -634,6 +635,7 @@ def main():
             extracted_data.append(summary_data)
 
     extracted_data = pd.DataFrame(extracted_data)
+
     # Save this as a csv
     # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
 

From c0d896cd59dc3ba003024da9c1caf81737b28d55 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:35:57 +0000
Subject: [PATCH 40/59] Debugging secondary heating extraction

---
 etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ad35e2d5..dc01ef6f 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -460,8 +460,11 @@ def extract_epr(pdf_path):
             if data["Existing Secondary Heating System"] == "":
                 data["Existing Secondary Heating Controls"] = ""
             else:
-                data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
-                                                                        secondary_text).group(1).strip()
+                # Might not have heating controls on 2nd system
+                secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
+                data["Existing Secondary Heating Controls"] = (
+                    secondary_controls_match.group(1).strip() if secondary_controls_match else ""
+                )
             data["Existing Secondary Heating % of Heat"] = int(
                 re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
             )

From 4160ec4dcbae01b438010cc75e0d6eb157d76df2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 17:52:51 +0000
Subject: [PATCH 41/59] debugging missing secondary heating for summary report,
 completed extraction for files

---
 .../stonewater/Wave 3 Preparation.py          | 44 ++++++++++++-------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index dc01ef6f..7bedef29 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -109,7 +109,10 @@ def extract_summary_report(pdf_path):
         # Extract heating system
         # Extract Primary Heating Data
         # Extract Primary Heating Section
-        primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
+        primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
+        primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+        primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
+
         primary_text = primary_heating_section.group(1)
 
         data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
@@ -126,21 +129,29 @@ def extract_summary_report(pdf_path):
 
         # Extract Secondary Heating Section
         secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
-        secondary_text = secondary_heating_section.group(1)
 
-        main_heating_code_match_secondary = re.search(
-            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
-        )
-        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
-        data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                      secondary_text).group(1)
-        second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
-        data["Existing Secondary Heating Controls"] = (
-            second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
-        )
-        data["Existing Secondary Heating % of Heat"] = int(
-            re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
-        )
+        if secondary_heating_section is None:
+            data["Existing Secondary Heating System"] = ""
+            data["Existing Secondary Heating PCDF Reference"] = ""
+            data["Existing Secondary Heating Controls"] = ""
+            data["Existing Secondary Heating % of Heat"] = 0
+
+        else:
+            secondary_text = secondary_heating_section.group(1)
+
+            main_heating_code_match_secondary = re.search(
+                r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
+            )
+            data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
+            data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+                                                                          secondary_text).group(1)
+            second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
+            data["Existing Secondary Heating Controls"] = (
+                second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
+            )
+            data["Existing Secondary Heating % of Heat"] = int(
+                re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
+            )
 
         # Extract Secondary Heating and Water Heating Codes
         secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
@@ -638,6 +649,9 @@ def main():
             extracted_data.append(summary_data)
 
     extracted_data = pd.DataFrame(extracted_data)
+    extracted_data["Primary Energy Use (kWh/yr)"] = (
+        extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
+    )
 
     # Save this as a csv
     # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)

From dbee05e555d758d464efe2a43c18d6c3b017cef8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 29 Oct 2024 18:37:47 +0000
Subject: [PATCH 42/59] working on matching lookup

---
 .../stonewater/Wave 3 Preparation.py          | 48 ++++++++++++++++++-
 .../requirements/requirements-wave-3-prep.txt |  1 +
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 7bedef29..d90360aa 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -5,7 +5,8 @@ import pandas as pd
 from tqdm import tqdm
 from collections import Counter
 
-FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
+CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
+FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
 
 
 def extract_summary_report(pdf_path):
@@ -653,6 +654,51 @@ def main():
         extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
     )
 
+    # We now merge on the coordinator data so that against each property, we can map the measures
+    retrofit_packages_board = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"),
+        header=4
+    )
+    retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+    # We now match this retrofit packages board to the extracted data
+    matching_lookup = []
+    for _, home in retrofit_packages_board.iterrows():
+        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
+        if filtered.empty:
+            print("Check this once we have full data")
+            continue
+
+        if filtered.shape[0] == 1:
+            matching_lookup.append(
+                {
+                    "survey_folder": filtered["survey_folder"].values[0],
+                    "Osm. ID": home["Osm. ID"],
+                    "Name": home["Name"]
+                }
+            )
+            continue
+
+        # home["Name"] should be contained in the survey_folder
+        filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+        # We have an edge case wher some properties have two outputs in Sharepoint
+        if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+            filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+
+        if filtered.empty:
+            raise Exception("somethign went wrong")
+        if filtered.shape[0] != 1:
+            raise Exception("somethign went wrong2")
+
+        matching_lookup.append(
+            {
+                "survey_folder": filtered["survey_folder"].values[0],
+                "Osm. ID": home["Osm. ID"],
+                "Name": home["Name"]
+            }
+        )
+
+    matching_lookup = pd.DataFrame(matching_lookup)
+
     # Save this as a csv
     # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
 
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index 70bec3cc..97314b32 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -1,3 +1,4 @@
 PyPDF2
 pandas
 tqdm
+openpyxl

From 791262fa866e420cef6a2eced9b4f4ec28897409 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 09:29:11 +0000
Subject: [PATCH 43/59] adding all surveys and updating creation of filepaths

---
 .../stonewater/Wave 3 Preparation.py          | 124 +++++++++++++++++-
 1 file changed, 117 insertions(+), 7 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d90360aa..fe1faa9d 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2,11 +2,13 @@ import os
 import PyPDF2
 import re
 import pandas as pd
+import numpy as np
 from tqdm import tqdm
 from collections import Counter
 
 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
-FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
+SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
+NUM_FOLDERS = 14
 
 
 def extract_summary_report(pdf_path):
@@ -610,11 +612,18 @@ def main():
     This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
     """
     # List only directories in the specified FILE_PATH
-    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+    survey_folders = []
+
+    # Loop over each survey folder and list its contents
+    for i in range(1, NUM_FOLDERS + 1):
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+        if os.path.isdir(folder_path):  # Check if folder exists
+            folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+            survey_folders.extend(folder_contents)  # Append contents to the master list
 
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
-        survey_folder_path = os.path.join(FILE_PATH, survey_folder)
+        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
 
         # List the folders inside of the survey folder
         survey_subfolders = [name for name in os.listdir(survey_folder_path)
@@ -623,9 +632,17 @@ def main():
         # Check if there's a "retrofit assessment" folder
         retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
 
+        ra_folder = next(
+            (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+            None
+        )
+
         # If retrofit assessment folder exists, check if it has content
-        if retrofit_folder:
-            retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+        if retrofit_folder or ra_folder:
+            if retrofit_folder:
+                retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+            else:
+                retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
             if os.listdir(retrofit_folder_path):  # If not empty
                 summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                 if summary_data:
@@ -642,6 +659,11 @@ def main():
         # If no retrofit folder or it was empty, check files in survey_folder
 
         summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+        if not summary_data:
+            if len(survey_subfolders) == 1:
+                survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+                summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,
@@ -650,9 +672,14 @@ def main():
             extracted_data.append(summary_data)
 
     extracted_data = pd.DataFrame(extracted_data)
+
+    # What was missed???
+
     extracted_data["Primary Energy Use (kWh/yr)"] = (
         extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
     )
+    # TODO: Clean up SAP and extract EPC
+    # TODO: RIR floor area!!!
 
     # We now merge on the coordinator data so that against each property, we can map the measures
     retrofit_packages_board = pd.read_excel(
@@ -663,7 +690,13 @@ def main():
     # We now match this retrofit packages board to the extracted data
     matching_lookup = []
     for _, home in retrofit_packages_board.iterrows():
-        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
+        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+
+        # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+        filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+            home["Name"].replace(r"[^\w\s]", ""), case=False
+        )]
+
         if filtered.empty:
             print("Check this once we have full data")
             continue
@@ -684,8 +717,12 @@ def main():
         if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
             filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
 
+        if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
         if filtered.empty:
-            raise Exception("somethign went wrong")
+            print("Check this once we have full data2!!!")
+            continue
         if filtered.shape[0] != 1:
             raise Exception("somethign went wrong2")
 
@@ -699,6 +736,79 @@ def main():
 
     matching_lookup = pd.DataFrame(matching_lookup)
 
+    if matching_lookup["Osm. ID"].duplicated().sum():
+        raise Exception("Duplicate Osm. IDs")
+
+    if matching_lookup["survey_folder"].duplicated().sum():
+        raise Exception("Duplicate survey folders")
+
+    measure_columns = [
+        'Main Wall Insulation',
+        'Secondary Wall Insulation',
+        'Loft insulation',
+        'Flat Roof',
+        'Room in Roof',
+        'Window Upgrade',
+        'Door Upgrade',
+        'Ventilation',
+        'Main Heating',
+        'Water Heating',
+        'Heating Controls',
+        'Solar PV',
+        'Other measures'
+    ]
+
+    # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
+    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
+        retrofit_packages_board[
+            [
+                "Name",
+                "Osm. ID",
+                "Address ID",
+                "Archetype ID",
+                "Arch. Group Rank", "Archetype Representative",
+                "Actual SAP Band",
+                "Actual SAP Rating",
+                "Modelled SAP Band",
+                "Modelled SAP Rating",
+            ] + measure_columns
+            ],
+        on=["Osm. ID", "Name"],
+        how="left"
+    )
+
+    # We've appended the recommended packages and modelled SAP ratings to the data
+    # We also want to append the windows data
+    windows_data = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
+        ),
+        header=12
+    )
+
+    # We get a lookup id of Osm.ID and when the windows were fitted
+    windows_data = windows_data[
+        ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
+    ]
+    # Convert to string for the moment
+    windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
+        "Parent Asset Window attributes - Fitted/renewed date"
+    ].astype(str)
+    # Create a single date column
+    windows_data["Fitted/renewed date"] = np.where(
+        pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
+        windows_data["Window attributes - Fitted/renewed date"],
+        windows_data["Parent Asset Window attributes - Fitted/renewed date"]
+    )
+    # Convert to a date
+    windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
+    # Calculate the number of years since something was done on the windows
+    windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
+        "Fitted/renewed date"]).dt.days / 365
+
+    # TODO: Flag if a package includes windows
+
     # Save this as a csv
     # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
 

From 8983ebec2fd9ea593f19990f5c02847da4adbc45 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 10:03:10 +0000
Subject: [PATCH 44/59] adding epc band

---
 .../stonewater/Wave 3 Preparation.py          | 59 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fe1faa9d..2654fae5 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -11,6 +11,32 @@ SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
 NUM_FOLDERS = 14
 
 
+def sap_to_epc(sap_points: int | float):
+    """
+    Simple utility function to convert SAP points to EPC rating.
+    :param sap_points: numerical value of SAP points, typically between 0 and 100
+    :return:
+    """
+
+    if sap_points <= 0:
+        raise ValueError("SAP points should be above 0.")
+
+    if sap_points >= 92:
+        return "A"
+    elif sap_points >= 81:
+        return "B"
+    elif sap_points >= 69:
+        return "C"
+    elif sap_points >= 55:
+        return "D"
+    elif sap_points >= 39:
+        return "E"
+    elif sap_points >= 21:
+        return "F"
+    else:
+        return "G"
+
+
 def extract_summary_report(pdf_path):
     """
     Extracts specific data from the provided PDF file.
@@ -23,6 +49,7 @@ def extract_summary_report(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
+        "Current EPC Band": None,
         "Fuel Bill": None,
         "Number of Storeys": None,
         "Window Age Description": None,
@@ -57,7 +84,7 @@ def extract_summary_report(pdf_path):
 
         # Extract Current SAP rating
         sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
-        data["Current SAP Rating"] = sap_match.group(1)
+        data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
 
         # Number of storeys
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@@ -367,6 +394,7 @@ def extract_epr(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
+        "Current EPC Band": None,
         "Primary Energy Use (kWh/yr)": None,
         "Primary Energy Use Intensity (kWh/m2/yr)": None,
         "Number of Storeys": None,
@@ -621,6 +649,9 @@ def main():
             folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
             survey_folders.extend(folder_contents)  # Append contents to the master list
 
+    # Get rid of .DS_Store files
+    survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")]
+
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
         survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
@@ -643,6 +674,16 @@ def main():
                 retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
             else:
                 retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+            # Check if everything inside is a sub-folder and the number of folders is 2
+            items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+            all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+            if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+                # Get the folder that isn't Property Pics
+                retrofit_folder_path = os.path.join(
+                    retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+                )
+
             if os.listdir(retrofit_folder_path):  # If not empty
                 summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                 if summary_data:
@@ -673,14 +714,24 @@ def main():
 
     extracted_data = pd.DataFrame(extracted_data)
 
-    # What was missed???
-
     extracted_data["Primary Energy Use (kWh/yr)"] = (
         extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
     )
+    extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
+    extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)
+
     # TODO: Clean up SAP and extract EPC
     # TODO: RIR floor area!!!
 
+    # Remove some definite duplicates
+    extracted_data = extracted_data[
+        ~extracted_data["survey_folder"].isin(
+            [
+                "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+            ]
+        )
+    ]
+
     # We now merge on the coordinator data so that against each property, we can map the measures
     retrofit_packages_board = pd.read_excel(
         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"),
@@ -715,9 +766,11 @@ def main():
         filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
         # We have an edge case wher some properties have two outputs in Sharepoint
         if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+            bl1h2
             filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
 
         if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            blah1
             filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
 
         if filtered.empty:

From cb9399a704bcf2605429bc18704c0ff2b413d406 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 10:22:23 +0000
Subject: [PATCH 45/59] investigating missing matches

---
 .../stonewater/Wave 3 Preparation.py          | 40 ++++++++++++++-----
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 2654fae5..53279eed 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -720,15 +720,22 @@ def main():
     extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
     extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)
 
-    # TODO: Clean up SAP and extract EPC
     # TODO: RIR floor area!!!
 
     # Remove some definite duplicates
+    dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"]
+    dupes = extracted_data[extracted_data["Address"].isin(dupes)]
+    dupes = dupes.sort_values("Address")
+    # Get all of the folders that end with ROSS
+    to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
+
     extracted_data = extracted_data[
         ~extracted_data["survey_folder"].isin(
             [
                 "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
-            ]
+                "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
+                "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
+            ] + to_drop
         )
     ]
 
@@ -740,8 +747,15 @@ def main():
     retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
     # We now match this retrofit packages board to the extracted data
     matching_lookup = []
-    for _, home in retrofit_packages_board.iterrows():
-        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+    for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
+        # Handle the case that has the wrong postcode in the asset data
+        if home["Name"] == "Flat 21 Walmer Street":
+            filtered = extracted_data[
+                extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD"
+                ].copy()
+        else:
+            filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
 
         # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
         filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
@@ -749,7 +763,6 @@ def main():
         )]
 
         if filtered.empty:
-            print("Check this once we have full data")
             continue
 
         if filtered.shape[0] == 1:
@@ -766,18 +779,20 @@ def main():
         filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
         # We have an edge case wher some properties have two outputs in Sharepoint
         if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
-            bl1h2
-            filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+            raise Exception("Fix me1")
+            # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
 
         if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
-            blah1
-            filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+            raise Exception("Fix me2")
+            # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
+        if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+            filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
 
         if filtered.empty:
-            print("Check this once we have full data2!!!")
             continue
         if filtered.shape[0] != 1:
-            raise Exception("somethign went wrong2")
+            raise Exception("something went wrong")
 
         matching_lookup.append(
             {
@@ -788,6 +803,9 @@ def main():
         )
 
     matching_lookup = pd.DataFrame(matching_lookup)
+    # Find Osmosis IDs that are in the packages board but not in the matching looking
+    # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"])
+    # missing_osm_ids = list(missing_osm_ids)
 
     if matching_lookup["Osm. ID"].duplicated().sum():
         raise Exception("Duplicate Osm. IDs")

From 51c2d04a6d0d919a07edac2d34e868a59c755b2d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 11:42:40 +0000
Subject: [PATCH 46/59] fixing missed matches

---
 .../stonewater/Wave 3 Preparation.py          | 80 ++++++++++++++-----
 1 file changed, 59 insertions(+), 21 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 53279eed..5e444ca8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -741,26 +741,53 @@ def main():
 
     # We now merge on the coordinator data so that against each property, we can map the measures
     retrofit_packages_board = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"),
         header=4
     )
     retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+    # Take just the rows that have been surveyed
+    retrofit_packages_board = retrofit_packages_board[
+        retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
+    ]
+
+    # Replace \n with ""
+    extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "")
+
+    manual_filters = {
+        "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
+        "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
+        "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB",
+        "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
+        'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
+        '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
+        '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
+        'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
+        'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+    }
+
     # We now match this retrofit packages board to the extracted data
     matching_lookup = []
     for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
-
         # Handle the case that has the wrong postcode in the asset data
-        if home["Name"] == "Flat 21 Walmer Street":
-            filtered = extracted_data[
-                extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD"
-                ].copy()
+        if home["Address ID"] == 6111566:
+            blah
+        # 6118117, 6118744, 6117091
+        if home["Name"] in manual_filters:
+            filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy()
         else:
             filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+            filtered["survey_folder"].values
 
-        # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
-        filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
-            home["Name"].replace(r"[^\w\s]", ""), case=False
-        )]
+            # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+            to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+                home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+            )
+            if to_filter.sum() == 0:
+                to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
+                                                                                                                   "").str.contains(
+                    home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+                )
+            filtered = filtered[to_filter]
 
         if filtered.empty:
             continue
@@ -769,7 +796,7 @@ def main():
             matching_lookup.append(
                 {
                     "survey_folder": filtered["survey_folder"].values[0],
-                    "Osm. ID": home["Osm. ID"],
+                    "Address ID": home["Address ID"],
                     "Name": home["Name"]
                 }
             )
@@ -797,15 +824,23 @@ def main():
         matching_lookup.append(
             {
                 "survey_folder": filtered["survey_folder"].values[0],
-                "Osm. ID": home["Osm. ID"],
+                "Address ID": home["Address ID"],
                 "Name": home["Name"]
             }
         )
 
     matching_lookup = pd.DataFrame(matching_lookup)
     # Find Osmosis IDs that are in the packages board but not in the matching looking
-    # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"])
-    # missing_osm_ids = list(missing_osm_ids)
+    missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"])
+    missing_ids = list(missing_ids)
+    print(len(missing_ids))
+    if missing_ids:
+        # We check that the missing ids have no data yet
+        missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][
+            ["Name", "Address ID", "Archetype ID"]]
+        extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values
+
+        matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")]
 
     if matching_lookup["Osm. ID"].duplicated().sum():
         raise Exception("Duplicate Osm. IDs")
@@ -834,7 +869,6 @@ def main():
         retrofit_packages_board[
             [
                 "Name",
-                "Osm. ID",
                 "Address ID",
                 "Archetype ID",
                 "Arch. Group Rank", "Archetype Representative",
@@ -848,6 +882,14 @@ def main():
         how="left"
     )
 
+    # Create a section for costs
+    for measure in measure_columns:
+        stonewater_data[f"Cost of {measure}"] = None
+
+    stonewater_data["Total Cost of Measures"] = None
+    stonewater_data["Contingency Cost"] = None
+    stonewater_data["Total Cost of Measures inc Contingency"] = None
+
     # We've appended the recommended packages and modelled SAP ratings to the data
     # We also want to append the windows data
     windows_data = pd.read_excel(
@@ -878,12 +920,8 @@ def main():
     windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
         "Fitted/renewed date"]).dt.days / 365
 
-    # TODO: Flag if a package includes windows
-
-    # Save this as a csv
-    # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
-
-    missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()]
+    stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"])
+    stonewater_data = stonewater_data.merge(windows_data, on="Osm. ID", how="left")
 
 # if __name__ == "__main__":
 #     main()

From 90c9466421b5cb187c9355d0a8c005f379650ece Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 13:46:43 +0000
Subject: [PATCH 47/59] sorted dupes

---
 .../stonewater/Wave 3 Preparation.py          | 44 ++++++++++++-------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 5e444ca8..67362865 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -756,27 +756,34 @@ def main():
     manual_filters = {
         "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
         "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
-        "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB",
         "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
         'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
         '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
         '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
         'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
         'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+        '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
+        '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+        '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
+        '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
+        # '2 Sorrell Place': '',
+        # '72 St Ives Road': '',
+        # '1 The Close, Burton Gardens': '',
+        # '102 Cheaton Close': '',
+        # 'Flat 16 Spring Gardens': '',
+        # '4 Apple Close': '',
+        '25 Folly Lane': '',
+
     }
 
     # We now match this retrofit packages board to the extracted data
     matching_lookup = []
     for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
         # Handle the case that has the wrong postcode in the asset data
-        if home["Address ID"] == 6111566:
-            blah
-        # 6118117, 6118744, 6117091
         if home["Name"] in manual_filters:
             filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy()
         else:
             filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
-            filtered["survey_folder"].values
 
             # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
             to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
@@ -836,14 +843,11 @@ def main():
     print(len(missing_ids))
     if missing_ids:
         # We check that the missing ids have no data yet
-        missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][
-            ["Name", "Address ID", "Archetype ID"]]
-        extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values
+        if len(missing_ids) != 8:
+            raise Exception("Unacceptable number of missings")
 
-        matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")]
-
-    if matching_lookup["Osm. ID"].duplicated().sum():
-        raise Exception("Duplicate Osm. IDs")
+    if matching_lookup["Address ID"].duplicated().sum():
+        raise Exception("Duplicate Address IDs")
 
     if matching_lookup["survey_folder"].duplicated().sum():
         raise Exception("Duplicate survey folders")
@@ -865,20 +869,21 @@ def main():
     ]
 
     # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
-    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
+    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge(
         retrofit_packages_board[
             [
                 "Name",
+                "RA",
                 "Address ID",
                 "Archetype ID",
-                "Arch. Group Rank", "Archetype Representative",
+                "Arch. Group Rank",
                 "Actual SAP Band",
                 "Actual SAP Rating",
                 "Modelled SAP Band",
                 "Modelled SAP Rating",
             ] + measure_columns
             ],
-        on=["Osm. ID", "Name"],
+        on=["Address ID", "Name"],
         how="left"
     )
 
@@ -900,9 +905,13 @@ def main():
         header=12
     )
 
+    windows_data = windows_data[windows_data["Address ID"] != "Address ID"]
+    windows_data = windows_data[~pd.isnull(windows_data["Address ID"])]
+
     # We get a lookup id of Osm.ID and when the windows were fitted
     windows_data = windows_data[
-        ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
+        ["Address ID", "Window attributes - Fitted/renewed date",
+         "Parent Asset Window attributes - Fitted/renewed date"]
     ]
     # Convert to string for the moment
     windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
@@ -921,7 +930,8 @@ def main():
         "Fitted/renewed date"]).dt.days / 365
 
     stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"])
-    stonewater_data = stonewater_data.merge(windows_data, on="Osm. ID", how="left")
+    windows_data["Address ID"] = windows_data["Address ID"].astype(float)
+    stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left")
 
 # if __name__ == "__main__":
 #     main()

From fba5b2b3cbe786dd7d16b1380fe59f9ff6447206 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 13:58:36 +0000
Subject: [PATCH 48/59] added RIR detection to summary report

---
 .../stonewater/Wave 3 Preparation.py          | 34 ++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 67362865..6cf26df8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -320,7 +320,7 @@ def extract_building_parts_epr(text):
 def extract_building_parts_summary(text):
     """
     Extracts building parts and associated dimensions from the summary report PDF.
-    This includes Main Property and multiple extensions if they exist.
+    This includes Main Property, multiple extensions if they exist, and Room in Roof areas.
     """
     data = []
 
@@ -368,6 +368,20 @@ def extract_building_parts_summary(text):
                 "Party Wall Length (m)": party_wall_length
             })
 
+        # Check specifically for "Room(s) in Roof" entries, which only have Floor Area
+        room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")
+        room_in_roof_match = room_in_roof_pattern.search(floor_data)
+        if room_in_roof_match:
+            floor_area = float(room_in_roof_match.group(1))
+            data.append({
+                "Building Part": part_name,
+                "Floor Level": "Room in Roof",
+                "Floor Area (m2)": floor_area,
+                "Room Height (m)": None,  # Placeholder for missing data
+                "Perimeter (m)": None,  # Placeholder for missing data
+                "Party Wall Length (m)": None  # Placeholder for missing data
+            })
+
     # Calculate aggregated dimensions
     main_property = [part for part in data if "Main Property" in part["Building Part"]]
     first_extensions = [part for part in data if "1st Extension" in part["Building Part"]]
@@ -376,10 +390,14 @@ def extract_building_parts_summary(text):
         "Total Ground Floor Area (m2)": sum(
             [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]]
         ),
-        "RIR Floor Area": 0,
-        "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]),
+        "RIR Floor Area": sum(
+            [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]]
+        ),
+        "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if
+                                             x["Perimeter (m)"] and x["Room Height (m)"]]),
         "First Extension Wall Area (m2)": sum(
-            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions]
+            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if
+             x["Perimeter (m)"] and x["Room Height (m)"]]
         ),
     }
 
@@ -887,6 +905,9 @@ def main():
         how="left"
     )
 
+    if stonewater_data["Address ID"].duplicated().sum():
+        raise Exception("Duplicate Address IDs")
+
     # Create a section for costs
     for measure in measure_columns:
         stonewater_data[f"Cost of {measure}"] = None
@@ -933,5 +954,10 @@ def main():
     windows_data["Address ID"] = windows_data["Address ID"].astype(float)
     stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left")
 
+    if stonewater_data["Address ID"].duplicated().sum():
+        raise Exception("Duplicate Address IDs")
+
+    # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values
+
 # if __name__ == "__main__":
 #     main()

From d0cf88af6498d73a1155af320e5d6b899e3f94fa Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 14:09:42 +0000
Subject: [PATCH 49/59] added RIR area search for epr

---
 .../stonewater/Wave 3 Preparation.py          | 43 ++++++++++++++-----
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 6cf26df8..ee5cd1ca 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -256,8 +256,9 @@ def extract_window_age_description(windows_text):
 
 def extract_building_parts_epr(text):
     """
-    Extracts building parts and associated dimensions from the provided PDF file.
+    Extracts building parts and associated dimensions from the provided PDF text.
     Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length.
+    Handles cases where 'Room(s) in Roof area' appears within the part_name with only the Floor Area information.
     """
     data = []
 
@@ -271,12 +272,28 @@ def extract_building_parts_epr(text):
     # Extract each building part
     for match in building_part_pattern.finditer(text):
         part_name = match.group(1).strip()
-        # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension")
-        cleaned_part_name = re.sub(r" - built in.*", "", part_name)
-
         floor_data = match.group(2)
 
-        # Pattern to match each floor's measurements
+        # Check for "Room(s) in Roof area" within the part_name
+        room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name)
+        if room_in_roof_match:
+            # Extract Room in Roof area and add it as a separate entry
+            floor_area = float(room_in_roof_match.group(1))
+            # Clean up part name to exclude "Room(s) in Roof area" from the building part name
+            cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
+            data.append({
+                "Building Part": cleaned_part_name,
+                "Floor Level": "Room in Roof",
+                "Floor Area (m2)": floor_area,
+                "Room Height (m)": None,  # Placeholder for missing data
+                "Perimeter (m)": None,  # Placeholder for missing data
+                "Party Wall Length (m)": None  # Placeholder for missing data
+            })
+        else:
+            # Clean up part name to keep only the descriptor (e.g., "Main" or "1st Extension")
+            cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip()
+
+        # Pattern to match each floor's measurements in standard cases
         floor_pattern = re.compile(
             r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
         )
@@ -299,8 +316,7 @@ def extract_building_parts_epr(text):
                 "Party Wall Length (m)": party_wall_length
             })
 
-    # We now extract out the aggregated data
-
+    # Aggregated data calculation
     main_building = [part for part in data if "Main" in part["Building Part"]]
     first_extension = [part for part in data if "1st Extension" in part["Building Part"]]
     dimensions = {
@@ -308,10 +324,17 @@ def extract_building_parts_epr(text):
         "Total Ground Floor Area (m2)": sum(
             [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]]
         ),
-        "RIR Floor Area": 0,
-        "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]),
+        "RIR Floor Area": sum(
+            [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]]
+        ),
+        "Main Building Wall Area (m2)": sum(
+            [x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building if
+             x["Perimeter (m)"] and x["Room Height (m)"]]
+        ),
         "First Extension Wall Area (m2)": sum(
-            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0,
+            [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension if
+             x["Perimeter (m)"] and x["Room Height (m)"]]
+        ) if first_extension else 0,
     }
 
     return dimensions

From f97bb7f1273b349abd77f75ff09152af87506f4e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 14:14:40 +0000
Subject: [PATCH 50/59] extract lighting fittings from epr

---
 etl/customers/stonewater/Wave 3 Preparation.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ee5cd1ca..16970803 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -461,7 +461,10 @@ def extract_epr(pdf_path):
         'Total Ground Floor Area (m2)': None,
         'RIR Floor Area': None,
         'Main Building Wall Area (m2)': None,
-        'First Extension Wall Area (m2)': None
+        'First Extension Wall Area (m2)': None,
+        "Number of Light Fittings": None,
+        "Number of LEL Fittings": None,
+        "Number of fittings needing LEL": None
     }
 
     with open(pdf_path, "rb") as file:
@@ -573,6 +576,13 @@ def extract_epr(pdf_path):
         building_parts = extract_building_parts_epr(text)
         data.update(building_parts)
 
+        # Get the total number of light fittings, the number that are already LEL, and the number still needing LEL
+        lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
+        data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
+        lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text)
+        data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
+        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
+
     return data
 
 

From bccf3c621bbec73ac35a18f123ba73b456c695df Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 14:17:20 +0000
Subject: [PATCH 51/59] lighting fitting extraction from summary report

---
 etl/customers/stonewater/Wave 3 Preparation.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 16970803..ccd062e2 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -73,7 +73,10 @@ def extract_summary_report(pdf_path):
         'Total Ground Floor Area (m2)': None,
         'RIR Floor Area': None,
         'Main Building Wall Area (m2)': None,
-        'First Extension Wall Area (m2)': None
+        'First Extension Wall Area (m2)': None,
+        "Number of Light Fittings": None,
+        "Number of LEL Fittings": None,
+        "Number of fittings needing LEL": None
     }
 
     with open(pdf_path, "rb") as file:
@@ -198,6 +201,10 @@ def extract_summary_report(pdf_path):
         dimensions = extract_building_parts_summary(text)
         data.update(dimensions)
 
+        data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
+        data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
+        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
+
     return data
 
 
@@ -771,8 +778,6 @@ def main():
     extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
     extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)
 
-    # TODO: RIR floor area!!!
-
     # Remove some definite duplicates
     dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"]
     dupes = extracted_data[extracted_data["Address"].isin(dupes)]

From 7e26fb4b86eee0c5f0ab3bd4e562796d44c5d0a7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 20:30:05 +0000
Subject: [PATCH 52/59] working on proposed sample for stonewater

---
 .../stonewater/Wave 3 Preparation.py          | 203 +++++++++++++++++-
 1 file changed, 201 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ccd062e2..bfdc8beb 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -486,7 +486,7 @@ def extract_epr(pdf_path):
         data["Postcode"] = data["Address"].split(",")[-1].strip()
 
         # Extract Current and Potential SAP ratings
-        sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
+        sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
         current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
         data["Current SAP Rating"] = current_sap
 
@@ -896,7 +896,6 @@ def main():
     # Find Osmosis IDs that are in the packages board but not in the matching looking
     missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"])
     missing_ids = list(missing_ids)
-    print(len(missing_ids))
     if missing_ids:
         # We check that the missing ids have no data yet
         if len(missing_ids) != 8:
@@ -937,6 +936,7 @@ def main():
                 "Actual SAP Rating",
                 "Modelled SAP Band",
                 "Modelled SAP Rating",
+                "Package Ref",
             ] + measure_columns
             ],
         on=["Address ID", "Name"],
@@ -995,7 +995,206 @@ def main():
     if stonewater_data["Address ID"].duplicated().sum():
         raise Exception("Duplicate Address IDs")
 
+    # Save this data to excel
+    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False)
+
+    cost_sheet = [
+        {
+            "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2"
+        },
+        {
+            "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2"
+        },
+        {
+            "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2"
+        },
+        {
+            "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2"
+        },
+        {
+            "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2"
+        },
+        {
+            "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2"
+        },
+        {
+            "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2"
+        },
+        {
+            "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each"
+        },
+        {
+            "measure": "Secondary 2.40", "cost": 974, "unit": "each"
+        },
+        {
+            "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "Ins. Door 1.40 w.m2.K", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "DMEV", "cost": 900, "unit": "each"
+        },
+        {
+            "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "HHRSH Quantum 150", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each"
+        },
+        {
+            "measure": "Smart Thermostat", "cost": 1200, "unit": "each"
+        },
+        {
+            "measure": "TRV's", "cost": 350, "unit": "each"
+        },
+        {
+            "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each"
+        },
+        {
+            "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each"
+        },
+        {
+            "measure": "LEL", "cost": 35, "unit": "per bulb"
+        },
+        {
+            "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2"
+        },
+        {
+            "measure": "Roof 0.16 - Walls 0.16", "cost": 180, "unit": "floor area m2"
+        },
+    ]
+    cost_sheet = pd.DataFrame(cost_sheet)
+
+    # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater
+    cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False)
+
+    stonewater_data["Room in Roof"].value_counts()
+
     # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values
 
+    create_proposed_wave_3_bid(
+        costed_packages_filepath=os.path.join(
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx"
+        ),
+        archetypes_sheet_filepath=os.path.join(
+            CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
+        )
+    )
+
+
+def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath):
+    # We read in the costed packages
+    costed_packages = pd.read_excel(costed_packages_filepath)
+
+    archetypes_to_cost = costed_packages[
+        [
+            "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band",
+            "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost',
+            'Total Cost of Measures inc Contingency'
+        ]
+    ].copy()
+
+    # We take properties that are EPC D and below (61% of units)
+    archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])]
+
+    archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"])
+
+    average_cost = archetypes_to_cost[
+        archetypes_to_cost["Has been modelled"]
+    ]['Total Cost of Measures inc Contingency'].mean()
+    print(average_cost)
+
+    # These are the Archetypes that will likely be suitable for Wave 3
+    archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4)
+    archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])]
+    archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"]
+    archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int)
+
+    # We merge the property details onto the costed archetypes
+    archetypes_to_cost = archetypes_to_cost.merge(
+        archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        on="Address ID",
+        how="left"
+    )
+
+    proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])]
+
+    proposed_sample = proposed_sample[
+        [
+            "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. ID", "Archetype ID",
+            "Property Type", "Wall Type", "Roof Type", "Heating"
+        ]
+    ]
+
+    # We classify each home's match to a surveyed property as "Exact" or "Approximate"
+
+    match_classification = []
+    for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)):
+        surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]]
+        # We now check if we have a perfect match
+        surveyed = surveyed[
+            (surveyed["Property Type"] == home["Property Type"]) &
+            (surveyed["Wall Type"] == home["Wall Type"]) &
+            (surveyed["Roof Type"] == home["Roof Type"]) &
+            (surveyed["Heating"] == home["Heating"])
+            ]
+
+        if surveyed.empty:
+            match_classification.append(
+                {
+                    "Address ID": home["Address ID"],
+                    "Match to Surveyed": "Approximate"
+                }
+            )
+            continue
+        match_classification.append(
+            {
+                "Address ID": home["Address ID"],
+                "Match to Surveyed": "Exact"
+            }
+        )
+
+    match_classification = pd.DataFrame(match_classification)
+
+    proposed_sample = proposed_sample.merge(
+        match_classification,
+        on="Address ID",
+        how="left",
+    )
+
+    # Merge on the cost per archetype
+    cost_per_archetype = (
+        archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index()
+    )
+    proposed_sample = proposed_sample.merge(
+        cost_per_archetype,
+        on="Archetype ID",
+        how="left"
+    )
+
+    # We add on a boolean to indicate if a property from that archetype has been modelled
+    proposed_sample = proposed_sample.merge(
+        archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(),
+        on="Archetype ID",
+        how="left"
+    )
+
+    proposed_sample["Total Cost of Measures inc Contingency"] = np.where(
+        ~proposed_sample["Has been modelled"],
+        None, proposed_sample["Total Cost of Measures inc Contingency"]
+    )
+
+    # Save excel
+    proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False)
+
 # if __name__ == "__main__":
 #     main()

From a9ea89d2ae5253453e227c83c067f8a248d3f893 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 31 Oct 2024 12:03:17 +0000
Subject: [PATCH 53/59] done with stonewater for now

---
 .../stonewater/Wave 3 Preparation.py          | 144 ++++++++++++++++--
 1 file changed, 133 insertions(+), 11 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index bfdc8beb..477a73c8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -76,10 +76,13 @@ def extract_summary_report(pdf_path):
         'First Extension Wall Area (m2)': None,
         "Number of Light Fittings": None,
         "Number of LEL Fittings": None,
-        "Number of fittings needing LEL": None
+        "Number of fittings needing LEL": None,
+        "Main Roof Type": None,
+        "Main Roof Insulation": None,
+        "Main Roof Insulation Thickness": None,
     }
 
-    with open(pdf_path, "rb") as file:
+    with (open(pdf_path, "rb") as file):
         reader = PyPDF2.PdfReader(file)
         text = ""
         for page in reader.pages:
@@ -205,6 +208,27 @@ def extract_summary_report(pdf_path):
         data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
         data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
 
+        roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL)
+        roof_text = roof_section.group(1).strip()
+        roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text)
+        data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None
+
+        # Check if "Insulation" exists between Type and Insulation Thickness
+        insulation_search = re.search(
+            r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL
+        )
+
+        if insulation_search:
+            # Insulation match will be present if it exists, otherwise it will be None
+            insulation_match = insulation_search.group(2)  # Optional group for Insulation
+            insulation_thickness_match = insulation_search.group(4)  # Required group for Insulation Thickness
+
+            # Populate insulation fields
+            data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None
+            data["Main Roof Insulation Thickness"] = (
+                insulation_thickness_match.strip() if insulation_thickness_match else None
+            )
+
     return data
 
 
@@ -434,6 +458,49 @@ def extract_building_parts_summary(text):
     return dimensions
 
 
+import re
+
+
+def extract_roof_details_epr(text):
+    """
+    Extracts roof type, insulation, and insulation thickness for each building part
+    in the provided EPR PDF text.
+    """
+    # Define data structure to hold results
+    roof_data = []
+
+    # Locate each building part section
+    building_part_pattern = re.compile(
+        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
+        re.DOTALL
+    )
+
+    # Extract each building part's data, including roof details
+    for match in building_part_pattern.finditer(text):
+        part_name = match.group(1).strip()
+
+        # Clean up the building part name
+        cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
+
+        part_details = match.group(2)
+
+        # Extract Roof Type, Roof Insulation, and Roof Insulation Thickness
+        roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details)
+        roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details)
+        roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details)
+
+        # Store results for this building part
+        roof_data.append({
+            "Building Part": cleaned_part_name,
+            "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None,
+            "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None,
+            "Roof Insulation Thickness": roof_insulation_thickness_match.group(
+                1).strip() if roof_insulation_thickness_match else None,
+        })
+
+    return roof_data
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -471,7 +538,10 @@ def extract_epr(pdf_path):
         'First Extension Wall Area (m2)': None,
         "Number of Light Fittings": None,
         "Number of LEL Fittings": None,
-        "Number of fittings needing LEL": None
+        "Number of fittings needing LEL": None,
+        "Main Roof Type": None,
+        "Main Roof Insulation": None,
+        "Main Roof Insulation Thickness": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -590,6 +660,13 @@ def extract_epr(pdf_path):
         data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
         data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
 
+        roof_details = extract_roof_details_epr(text)
+        # Get from the main building
+        main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]]
+        data["Main Roof Type"] = main_roof_details[0]["Roof Type"]
+        data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"]
+        data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"]
+
     return data
 
 
@@ -1077,13 +1154,11 @@ def main():
     # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater
     cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False)
 
-    stonewater_data["Room in Roof"].value_counts()
-
     # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values
 
     create_proposed_wave_3_bid(
         costed_packages_filepath=os.path.join(
-            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx"
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx"
         ),
         archetypes_sheet_filepath=os.path.join(
             CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
@@ -1098,11 +1173,30 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
     archetypes_to_cost = costed_packages[
         [
             "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band",
-            "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost',
-            'Total Cost of Measures inc Contingency'
+            "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost',
+            'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation',
+            'Main Roof Insulation Thickness', 'Existing Primary Heating System',
+            'Existing Primary Heating PCDF Reference'
         ]
     ].copy()
 
+    # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons!
+    archetypes_to_cost['Surveyed Main Roof'] = (
+        archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' +
+        archetypes_to_cost['Main Roof Insulation Thickness'].astype(str)
+    )
+
+    # Combine the heating systems, separating by colons!
+    archetypes_to_cost['Surveyed Main Heating'] = (
+        archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[
+        'Existing Primary Heating PCDF Reference'].astype(str)
+    )
+
+    archetypes_to_cost = archetypes_to_cost.drop(
+        columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness',
+                 'Existing Primary Heating System',
+                 'Existing Primary Heating PCDF Reference'])
+
     # We take properties that are EPC D and below (61% of units)
     archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])]
 
@@ -1139,7 +1233,19 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
 
     match_classification = []
     for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)):
-        surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]]
+
+        surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy()
+        surveyed["Package Ref"] = surveyed["Package Ref"].astype(str)
+
+        package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()]))
+        package = package.replace("\n", "")
+
+        surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()]))
+        surveyed_roofs = surveyed_roofs.replace("\n", "")
+
+        surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()]))
+        surveyed_heating = surveyed_heating.replace("\n", "")
+
         # We now check if we have a perfect match
         surveyed = surveyed[
             (surveyed["Property Type"] == home["Property Type"]) &
@@ -1149,17 +1255,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
             ]
 
         if surveyed.empty:
+            if package == "2B2A":
+                raise Exception("Fix me")
             match_classification.append(
                 {
                     "Address ID": home["Address ID"],
-                    "Match to Surveyed": "Approximate"
+                    "Match to Surveyed": "Approximate",
+                    "Proposed Package Ref": package,
+                    "Surveyed Archetype Roofs": surveyed_roofs,
+                    "Surveyed Archetype Heating": surveyed_heating
                 }
             )
             continue
+        # Re-do the summaries, this time from the exact-match rows only
+        package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()]))
+        package = package.replace("\n", "")
+        surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()]))
+        surveyed_roofs = surveyed_roofs.replace("\n", "")
+        surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()]))
+        surveyed_heating = surveyed_heating.replace("\n", "")
+
         match_classification.append(
             {
                 "Address ID": home["Address ID"],
-                "Match to Surveyed": "Exact"
+                "Match to Surveyed": "Exact",
+                "Proposed Package Ref": package,
+                "Surveyed Archetype Roofs": surveyed_roofs,
+                "Surveyed Archetype Heating": surveyed_heating
             }
         )
 

From 6cf0db87f7a3fc68db02d518f9e57bc28b3fe0c1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 31 Oct 2024 14:35:14 +0000
Subject: [PATCH 54/59] completed packages for first 12 surveys

---
 .idea/Model.iml                      |   2 +-
 .idea/misc.xml                       |   2 +-
 etl/customers/aiha/xml_extraction.py | 139 ++++++++++++++++-----------
 3 files changed, 85 insertions(+), 58 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..df6c4faa 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 038e8593..65e0eb1e 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -9,6 +9,32 @@ SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIH
 CONTINGENCY_RATE = 0.26
 
 
+def sap_to_epc(sap_points: int | float) -> str:
+    """
+    Convert SAP points to an EPC band letter, from "A" (92 and above) down to "G" (below 21).
+    :param sap_points: numerical value of SAP points, typically between 0 and 100
+    :return: EPC band letter; a ValueError is raised when sap_points is NaN or not above 0
+    """
+
+    if not sap_points > 0:  # also rejects NaN, which would otherwise fail every comparison and come out as "G"
+        raise ValueError("SAP points should be above 0.")
+
+    if sap_points >= 92:
+        return "A"
+    elif sap_points >= 81:
+        return "B"
+    elif sap_points >= 69:
+        return "C"
+    elif sap_points >= 55:
+        return "D"
+    elif sap_points >= 39:
+        return "E"
+    elif sap_points >= 21:
+        return "F"
+    else:
+        return "G"
+
+
 def main():
     """
     This script handles the extraction of data from the XML files in the survey folders.
@@ -76,24 +102,14 @@ def main():
     # TODO
     #   - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft
     #     [Can't remember, not clear - Chenai will check]
-    #   - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the
-    #     best option for this property due to it being extrememly large and the walls being uninsulated. It might not
-    #     be performant enough in the winter, when COP will be more like 1.5.
-    #   - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are
-    #     in the property? Does it make sense to have such a large solar PV system (5.6kWp)?
     #   - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C
     #       - Potential measure - search for the cylinder and insulate it
     #   - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same
     #     buulding [Question for Lewis & Kevin]
     #   - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from
     #   the other unit]
-    #   - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin]
     #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
-    #       [Question for Lewis & Kevin]
-    #   - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! - It's a Sun room!!]
-    #   - AIH001-12 - Why was there not access to the cylinder? [Sealed shut]
-    #   - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the
-    #                 windows]
+    #       [Question for Lewis & Kevin] - [YES - ASHP!!!!]
 
     recommended_measures = [
         {
@@ -114,40 +130,32 @@ def main():
                 },
                 {
                     "measure": "Solar PV",
-                    "description": "5.6kWp Solar PV system",
+                    "description": "4kWp Solar PV system",
                     "config": [
                         {
                             "size": "4kWp",
                             "orientation": "East",
                             "elavation": 30,
-                            "overshading": "Modest",
+                            "overshading": "None or little",
                         },
-                        {
-                            "size": "1.6kWp",
-                            "orientation": "Horizontal",
-                            "elavation": "Horizontal",
-                            "overshading": "Modest",
-                        }
                     ],
-                    "sap_points": 7,
-                    "ending_sap": 53
+                    "sap_points": 10,
+                    "ending_sap": 54
                 },
                 {
-                    "measure": "Loft Insulation",
-                    "description": "300mm loft insulation",
-                    "floor_area": 80,  # Based on area of 1st floor
-                    "sap_points": 8,
-                    "ending_sap": 61
+                    "measure": "Air Source Heat Pump",
+                    "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)",
+                    "sap_points": 20,
+                    "ending_sap": 74
                 },
                 {
-                    "measure": "TTZC",
-                    "description": "Smart Thermostat",
-                    "sap_points": 3,
-                    "ending_sap": 64
+                    "measure": "Tariff Review",
+                    "description": "Switch to 24-hour tariff",
+                    "sap_points": 15,
+                    "ending_sap": 89
                 }
             ],
-            "notes": "There was no access to the loft for this property and so a loft hatch would need to be "
-                     "installed..."
+            "notes": "Unclear if the loft is accessible"
         },
         {
             "survey_key": "AIH001-04",
@@ -174,14 +182,14 @@ def main():
                             "size": "4kWp",
                             "orientation": "South",
                             "elavation": 30,
-                            "overshading": "Modest",
+                            "overshading": "None or little",
                         }
                     ],
-                    "sap_points": 12,
-                    "ending_sap": 67
+                    "sap_points": 15,
+                    "ending_sap": 70
                 }
             ],
-            "notes": ""
+            "notes": "Roof is flat, PV array should be installed south facing with elevation"
         },
         {
             "survey_key": "AIH001-05",
@@ -276,7 +284,7 @@ def main():
                     "measure": "Internal Wall Insulation",
                     "description": "100mm internal wall insulation",
                     "hlp": 24.13 * 2.63,
-                    "sap_points": 5,
+                    "sap_points": 7,
                     "ending_sap": 69,
                 },
                 {
@@ -316,8 +324,32 @@ def main():
                     "description": "Smart Thermostat",
                     "sap_points": 3,
                     "ending_sap": 56,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "1.6kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "1.6W",
+                            "orientation": "South-East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 6,
+                    "ending_sap": 62
+                },
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "floor_area": 63.59 + 12.31,  # Based on area of main building and 1st extension
+                    "sap_points": 8,
+                    "ending_sap": 70,
                 }
-            ]
+            ],
+            "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array"
+                     "per unit). Area on south-east part of roof is ~22m2 with no overshadowing. Flat roof area is 8m2"
+                     "with modest overshadowing. We suggest a 3.2kWp system, across two units"
         },
         {
             "survey_key": "AIH001-11",
@@ -353,14 +385,7 @@ def main():
                     "description": "Installation of double glazing",
                     "n_windows": 20,  # Counted the bay windows each as 3
                     "windows_area": 10.66,
-                    "sap_points": 2,
-                    "ending_sap": 48,
-                },
-                {
-                    "measure": "Draught Proofing",
-                    "description": "Window draught proofing improvements",
-                    "n_windows": 20,  # Counted the bay windows each as 3
-                    "sap_points": 1,
+                    "sap_points": 3,
                     "ending_sap": 49,
                 },
                 {
@@ -379,7 +404,7 @@ def main():
                 },
                 {
                     "measure": "Air Source Heat Pump",
-                    "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump",
+                    "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)",
                     "sap_points": 15,
                     "ending_sap": 73
                 },
@@ -497,17 +522,19 @@ def main():
         {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'},
         {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'},
         {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None},
-        {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
-        {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'},
         {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'},
-        {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'},
-        {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
+        {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)', 'unit_price': 21189 + 1200,
+         'unit': 'unit'},
         {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80,
          'unit': 'floor_m2'},
-        {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
         {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'},
         {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'},
         {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'},
+        {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'},
+        {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
+        {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
+        {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
+        {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'},
     ]
     pricing_data = pd.DataFrame(pricing_data)
 
@@ -587,13 +614,13 @@ def main():
     result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left")
 
     # Step 5: Calculate the ending SAP.
-    result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"]
+    result_df["Ending SAP"] = result_df["starting_sap"] + result_df["total_sap_points"]
+    result_df["Ending EPC Rating"] = result_df["Ending SAP"].apply(sap_to_epc)
 
     # Step 6: Merge the result with the measures_data to get the final DataFrame.
     final_measures = measures_data.merge(
         result_df, how="left", on="survey_key"
     )
 
-
-if __name__ == "__main__":
-    main()
+# if __name__ == "__main__":
+#     main()

From 8f8993ab6480f30cbefe0ec8d6295005ba12dc6f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 31 Oct 2024 15:31:09 +0000
Subject: [PATCH 55/59] added some additional aiha packages

---
 etl/customers/aiha/xml_extraction.py | 78 ++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 65e0eb1e..25917f1e 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -111,6 +111,9 @@ def main():
     #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
     #       [Question for Lewis & Kevin] - [YES - ASHP!!!!]
 
+    # TODO: Need AIH001-02 9C Clapton Common
+    # TODO: Check which properties are in a conservation area
+
     recommended_measures = [
         {
             "survey_key": "AIH001-01",
@@ -501,6 +504,81 @@ def main():
                 }
             ]
         },
+        {
+            "survey_key": "AIH001-15",
+            "starting_sap": 60,
+            "recommended_measures": [
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "floor_area": 73.81,  # Based on area of main building
+                    "sap_points": 1,
+                    "ending_sap": 61,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 3,
+                    "ending_sap": 64,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "3.2kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "3.2W",
+                            "orientation": "North-West",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 7,
+                    "ending_sap": 71,
+                    "notes": "The array is North-west facing and therefore will be slightly less efficient than south"
+                             "facing, however the impact is not so severe as to make the installation not worthwhile."
+                             "Ground mounted"
+                }
+            ]
+        },
+        {
+            "survey_key": "AIH001-16",
+            "starting_sap": 60,
+            "recommended_measures": [
+                {
+                    "measure": "Cavity Wall Insulation",
+                    "description": "CWI to rdSAP default standard",
+                    "hlp": (21.56 * 2.60) + (26.79 * 2.8) + (6.74 * 2.60),
+                    "sap_points": 4,
+                    "ending_sap": 64,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "2x DMEV fans",
+                    "sap_points": 0,
+                    "ending_sap": 64,
+                },
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "sap_points": 1,
+                    "ending_sap": 65,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "2.4kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "2.4W",
+                            "orientation": "South-East",
+                            "elavation": 30,
+                            "overshading": "Modest",
+                        }
+                    ],
+                    "sap_points": 5,
+                    "ending_sap": 70,
+                }
+            ]
+        }
     ]
 
     scaffolding_data = [

From b6cf10287b5867aa20a00123ee8c4de3e590e4a0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 1 Nov 2024 07:20:55 +0000
Subject: [PATCH 56/59] added AIH001-17

---
 etl/customers/aiha/xml_extraction.py          | 38 +++++++++++++++++++
 .../stonewater/Wave 3 Preparation.py          |  8 ++--
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 25917f1e..8c5c9008 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -578,6 +578,44 @@ def main():
                     "ending_sap": 70,
                 }
             ]
+        },
+        {
+            "survey_key": "AIH001-17",
+            "starting_sap": 62,
+            "recommended_measures": [
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 1,
+                    "ending_sap": 63,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 3,
+                    "ending_sap": 66,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "4kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "3.2kW",
+                            "orientation": "East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        },
+                        {
+                            "size": "0.8kW",
+                            "orientation": "West",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 12,
+                    "ending_sap": 78,
+                }
+            ]
         }
     ]
 
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 477a73c8..9f929db1 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -458,9 +458,6 @@ def extract_building_parts_summary(text):
     return dimensions
 
 
-import re
-
-
 def extract_roof_details_epr(text):
     """
     Extracts roof type, insulation, and insulation thickness for each building part
@@ -1158,7 +1155,7 @@ def main():
 
     create_proposed_wave_3_bid(
         costed_packages_filepath=os.path.join(
-            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx"
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx"
         ),
         archetypes_sheet_filepath=os.path.join(
             CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
@@ -1168,7 +1165,8 @@ def main():
 
 def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath):
     # We read in the costed packages
-    costed_packages = pd.read_excel(costed_packages_filepath)
+    # Note: header=13 (zero-based row index) is for Matt Ratcliff's reviewed version
+    costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages")
 
     archetypes_to_cost = costed_packages[
         [

From 9ad7d3e46f30ee6a24e5d8c81dbd7f1035c04bee Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 4 Nov 2024 11:24:02 +0000
Subject: [PATCH 57/59] added missing windows age extraction

---
 etl/customers/aiha/xml_extraction.py   | 67 ++++++++++++++++++++++++--
 etl/xml_survey_extraction/XmlParser.py |  1 +
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 8c5c9008..7dc516a6 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -113,6 +113,7 @@ def main():
 
     # TODO: Need AIH001-02 9C Clapton Common
     # TODO: Check which properties are in a conservation area
+    # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR)
 
     recommended_measures = [
         {
@@ -560,6 +561,7 @@ def main():
                 {
                     "measure": "Loft Insulation",
                     "description": "300mm loft insulation",
+                    "floor_area": 20.92,  # Based on floor area of RIR
                     "sap_points": 1,
                     "ending_sap": 65,
                 },
@@ -616,6 +618,27 @@ def main():
                     "ending_sap": 78,
                 }
             ]
+        },
+        {
+            "survey_key": "AIH001-18",
+            "starting_sap": 58,
+            "recommended_measures": [],
+
+        },
+        {
+            "survey_key": "AIH001-19",
+            "starting_sap": 76,
+            "recommended_measures": []
+        },
+        {
+            "survey_key": "AIH001-20",
+            "starting_sap": 82,
+            "recommended_measures": []
+        },
+        {
+            "survey_key": "AIH001-21",
+            "starting_sap": 53,
+            "recommended_measures": []
         }
     ]
 
@@ -648,6 +671,7 @@ def main():
         {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'},
         {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'},
         {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
+        {'item': '2.4kWp Solar PV system', 'unit_price': 3363, 'unit': 'unit_needs_scaffolding'},
         {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
         {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
         {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'},
@@ -690,8 +714,14 @@ def main():
         total_cost = survey.get("total_cost", 0)
 
         for measure in survey.get("recommended_measures", []):
+            # Include hlp and floor_area for each measure if available
+            hlp = measure.get("hlp", None)
+            floor_area = measure.get("floor_area", None)
+
             normalized_measures.append({
                 "survey_key": survey_key,
+                "hlp": hlp,
+                "floor_area": floor_area,
                 "starting_sap": starting_sap,
                 "measure": measure["measure"],
                 "description": measure.get("description", ""),
@@ -712,16 +742,38 @@ def main():
         fill_value=None
     ).reset_index()
 
+    measures_columns = [x for x in pivoted_measures.columns if x not in ["survey_key"]]
+    # We add a "Cost of" column for each measure
+    for measure in measures_columns:
+        pivoted_measures[f"Cost of {measure}"] = None
+
+    pivoted_floor_area = measures_df.pivot_table(
+        index="survey_key",
+        columns="measure",
+        values="floor_area",
+        aggfunc="first"  # Use 'first' since each measure should only appear once per survey_key
+    ).add_prefix("floor_area - ").reset_index()
+
+    pivoted_hlp = measures_df.pivot_table(
+        index="survey_key",
+        columns="measure",
+        values="hlp",
+        aggfunc="first"
+    ).add_prefix("hlp - ").reset_index()
+
+    # Merge hlp and floor_area data
+    pivoted_measures = pivoted_measures.merge(pivoted_hlp, on="survey_key", how="left")
+    pivoted_measures = pivoted_measures.merge(pivoted_floor_area, on="survey_key", how="left")
+
     # Step 3: Calculate the total sap points and total cost for each survey.
-    sap_cost_totals = measures_df.groupby("survey_key").agg(
+    totals = measures_df.groupby("survey_key").agg(
         total_sap_points=("sap_points", "sum"),
-        total_cost_of_measures=("measure_cost", "sum")
     ).reset_index()
 
     # Merge total sap points into the pivoted measures.
-    pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left")
-    pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE
-    pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"]
+    pivoted_measures = pd.merge(pivoted_measures, totals, on="survey_key", how="left")
+    # pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE
+    # pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"]
 
     # Step 4: Extract starting SAP for each survey key.
     starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]]
@@ -738,5 +790,10 @@ def main():
         result_df, how="left", on="survey_key"
     )
 
+    final_measures.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Measures packages.csv")
+
+    # Store costs
+    pricing_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Pricing data.csv")
+
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index fa70b6b7..ef8daf51 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -784,6 +784,7 @@ class XmlParser:
 
         glazing_type_lookup = {
             "ND": "Single glazing",
+            "1": "double glazing installed before 2002",
             "2": "double glazing installed during or after 2002",
             "3": "double glazing, unknown install date",
             "5": "Single glazing",

From 5dc78d6bb9c6b14029488bb27d769967bb4ba658 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 4 Nov 2024 12:14:52 +0000
Subject: [PATCH 58/59] added measures for more properties

---
 etl/customers/aiha/xml_extraction.py | 105 ++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 2 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index 7dc516a6..d193c91e 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -122,6 +122,33 @@ def main():
             "recommended_measures": [],
             "notes": "Is EPC C"
         },
+        {
+            "survey_key": "AIH001-02",
+            "starting_sap": 65,
+            "recommended_measures": [
+                {
+                    "measure": "Solar PV",
+                    "description": "2.4kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "2.4W",
+                            "orientation": "Horizontal",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 7,
+                    "ending_sap": 72,
+                    "notes": "The array can be mounted on the flat roof, so that panels are south facing"
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 4,
+                    "ending_sap": 76
+                }
+            ],
+        },
         {
             "survey_key": "AIH001-03",
             "starting_sap": 43,
@@ -622,7 +649,41 @@ def main():
         {
             "survey_key": "AIH001-18",
             "starting_sap": 58,
-            "recommended_measures": [],
+            "recommended_measures": [
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "floor_area": 37.52,  # Based on area of main building and 1st extension
+                    "sap_points": 7,
+                    "ending_sap": 65,
+                },
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 1,
+                    "ending_sap": 66,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 2,
+                    "ending_sap": 68,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "3.2kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "3.2W",
+                            "orientation": "North-East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 7,
+                    "ending_sap": 75,
+                }
+            ],
 
         },
         {
@@ -638,7 +699,47 @@ def main():
         {
             "survey_key": "AIH001-21",
             "starting_sap": 53,
-            "recommended_measures": []
+            "recommended_measures": [
+                {
+                    "measure": "Cylinder Insulation",
+                    "description": "80mm cylinder insulation",
+                    "sap_points": 2,
+                    "ending_sap": 55,
+                },
+                {
+                    "measure": "Roof Insulation",
+                    "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+                    "floor_area": 22.80,  # Based on floor area of RIR
+                    "sap_points": 7,
+                    "ending_sap": 62,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "2.4kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "1.6kWp",
+                            "orientation": "Horizontal",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        },
+                        {
+                            "size": "0.8kWp",
+                            "orientation": "South-East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 9,
+                    "ending_sap": 71,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 3,
+                    "ending_sap": 74,
+                }
+            ]
         }
     ]
 

From b75ae5f6b8de5855fd5278079de009e9a99ceb0e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 5 Nov 2024 11:34:15 +0000
Subject: [PATCH 59/59] minor

---
 etl/customers/aiha/xml_extraction.py | 122 ++++++++++++++++++++++-----
 1 file changed, 103 insertions(+), 19 deletions(-)

diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
index d193c91e..531b6752 100644
--- a/etl/customers/aiha/xml_extraction.py
+++ b/etl/customers/aiha/xml_extraction.py
@@ -102,8 +102,6 @@ def main():
     # TODO
     #   - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft
     #     [Can't remember, not clear - Chenai will check]
-    #   - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C
-    #       - Potential measure - search for the cylinder and insulate it
     #   - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same
     #     buulding [Question for Lewis & Kevin]
     #   - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from
@@ -111,9 +109,9 @@ def main():
     #   - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
     #       [Question for Lewis & Kevin] - [YES - ASHP!!!!]
 
-    # TODO: Need AIH001-02 9C Clapton Common
     # TODO: Check which properties are in a conservation area
     # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR)
+    # TODO: Adjust Archetype 14 homes to exclude double glazing, or exclude them entirely?
 
     recommended_measures = [
         {
@@ -376,6 +374,8 @@ def main():
                     "floor_area": 63.59 + 12.31,  # Based on area of main building and 1st extension
                     "sap_points": 8,
                     "ending_sap": 70,
+                    "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, "
+                             "which is also owned by AIHA"
                 }
             ],
             "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array"
@@ -419,31 +419,31 @@ def main():
                     "sap_points": 3,
                     "ending_sap": 49,
                 },
-                {
-                    "measure": "Solar PV",
-                    "description": "3.2kWp Solar PV system",
-                    "config": [
-                        {
-                            "size": "3.2W",
-                            "orientation": "East",
-                            "elavation": 30,
-                            "overshading": "Little or none",
-                        }
-                    ],
-                    "sap_points": 9,
-                    "ending_sap": 58
-                },
+                # {
+                #     "measure": "Solar PV",
+                #     "description": "3.2kWp Solar PV system",
+                #     "config": [
+                #         {
+                #             "size": "3.2W",
+                #             "orientation": "East",
+                #             "elavation": 30,
+                #             "overshading": "Little or none",
+                #         }
+                #     ],
+                #     "sap_points": 9,
+                #     "ending_sap": 58
+                # },
                 {
                     "measure": "Air Source Heat Pump",
                     "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)",
                     "sap_points": 15,
-                    "ending_sap": 73
+                    "ending_sap": 65
                 },
                 {
                     "measure": "Tariff Review",
                     "description": "Switch to 24-hour tariff",
                     "sap_points": 15,
-                    "ending_sap": 88
+                    "ending_sap": 80
                 }
             ]
         },
@@ -740,6 +740,90 @@ def main():
                     "ending_sap": 74,
                 }
             ]
+        },
+        {
+            "survey_key": "AIH001-SIMULATED-01",
+            "elmhurst_reference": "000020",
+            "starting_sap": None,
+            "recommended_measures": [
+                {
+                    "measure": "Internal Wall Insulation",
+                    "description": "100mm internal wall insulation",
+                    "hlp": (22.35 * 3.24) + (22.13 * 2.53),
+                    "sap_points": 8,
+                    "ending_sap": 52,
+                },
+                {
+                    "measure": "Cavity Wall Insulation",
+                    "description": "CWI to rdSAP default standard",
+                    "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39),  # 1st & 2nd extension
+                    "sap_points": 1,
+                    "ending_sap": 53,
+                },
+                {
+                    "measure": "Ventilation",
+                    "description": "2x DMEV fans",
+                    "sap_points": 0,
+                    "ending_sap": 53,
+                },
+                {
+                    "measure": "TTZC",
+                    "description": "Smart Thermostat",
+                    "sap_points": 3,
+                    "ending_sap": 56,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "1.6kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "1.6W",
+                            "orientation": "South-East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 6,
+                    "ending_sap": 62
+                },
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "floor_area": 63.59 + 12.31,  # Based on area of main building and 1st extension
+                    "sap_points": 8,
+                    "ending_sap": 70,
+                    "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, "
+                             "which is also owned by AIHA"
+                }
+            ],
+            "notes": "This was cloned from 80A. There is no existing data for 80B"
+        },
+        {
+            "survey_key": "AIH001-SIMULATED-05",
+            "starting_sap": 68,
+            "recommended_measures": [
+                {
+                    "measure": "Loft Insulation",
+                    "description": "300mm loft insulation",
+                    "floor_area": 42.5,
+                    "sap_points": 1,
+                    "ending_sap": 69,
+                },
+                {
+                    "measure": "Solar PV",
+                    "description": "3.2kWp Solar PV system",
+                    "config": [
+                        {
+                            "size": "3.2W",
+                            "orientation": "North-East",
+                            "elavation": 30,
+                            "overshading": "None or little",
+                        }
+                    ],
+                    "sap_points": 8,
+                    "ending_sap": 77,
+                }
+            ]
         }
     ]