From 8983ebec2fd9ea593f19990f5c02847da4adbc45 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 10:03:10 +0000
Subject: [PATCH] adding epc band

---
 .../stonewater/Wave 3 Preparation.py          | 59 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fe1faa9d..2654fae5 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -11,6 +11,32 @@ SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
 NUM_FOLDERS = 14
 
 
+def sap_to_epc(sap_points: int | float) -> str:
+    """
+    Convert a SAP score to its EPC band letter.
+
+    :param sap_points: SAP score, typically between 1 and 100
+    :return: band letter from "A" (92 and above) down to "G" (below 21)
+    :raises ValueError: if sap_points is NaN, zero or negative
+    """
+    # "not x > 0" rather than "x <= 0" so that NaN is rejected too, not banded as "G".
+    if not sap_points > 0:
+        raise ValueError("SAP points should be above 0.")
+    # Lowest score of each band, best band first; anything under 21 is "G".
+    band_floors = (
+        (92, "A"),
+        (81, "B"),
+        (69, "C"),
+        (55, "D"),
+        (39, "E"),
+        (21, "F"),
+    )
+    for floor, band in band_floors:
+        if sap_points >= floor:
+            return band
+    return "G"
+
+
 def extract_summary_report(pdf_path):
     """
     Extracts specific data from the provided PDF file.
@@ -23,6 +49,7 @@ def extract_summary_report(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
+        "Current EPC Band": None,
         "Fuel Bill": None,
         "Number of Storeys": None,
         "Window Age Description": None,
@@ -57,7 +84,7 @@ def extract_summary_report(pdf_path):
 
         # Extract Current SAP rating
         sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
-        data["Current SAP Rating"] = sap_match.group(1)
+        data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
 
         # Number of storeys
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@@ -367,6 +394,7 @@ def extract_epr(pdf_path):
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
+        "Current EPC Band": None,
         "Primary Energy Use (kWh/yr)": None,
         "Primary Energy Use Intensity (kWh/m2/yr)": None,
         "Number of Storeys": None,
@@ -621,6 +649,9 @@ def main():
             folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
             survey_folders.extend(folder_contents)  # Append contents to the master list
 
+    # Get rid of .DS_Store files
+    survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")]
+
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
         survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
@@ -643,6 +674,16 @@ def main():
                 retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
             else:
                 retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+            # Check if everything inside is a sub-folder and the number of folders is 2
+            items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+            all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+            if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+                # Get the folder that isn't Property Pics
+                retrofit_folder_path = os.path.join(
+                    retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+                )
+
             if os.listdir(retrofit_folder_path):  # If not empty
                 summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                 if summary_data:
@@ -673,14 +714,24 @@ def main():
 
     extracted_data = pd.DataFrame(extracted_data)
 
-    # What was missed???
-
     extracted_data["Primary Energy Use (kWh/yr)"] = (
         extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
     )
+    extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
+    extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)
+
     # TODO: Clean up SAP and extract EPC
     # TODO: RIR floor area!!!
 
+    # Remove some definite duplicates
+    extracted_data = extracted_data[
+        ~extracted_data["survey_folder"].isin(
+            [
+                "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+            ]
+        )
+    ]
+
     # We now merge on the coordinator data so that against each property, we can map the measures
     retrofit_packages_board = pd.read_excel(
         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"),
@@ -715,9 +766,11 @@ def main():
         filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
         # We have an edge case wher some properties have two outputs in Sharepoint
         if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+            bl1h2
             filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
 
         if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            blah1
             filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
 
         if filtered.empty: