From 791262fa866e420cef6a2eced9b4f4ec28897409 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 30 Oct 2024 09:29:11 +0000
Subject: [PATCH] adding all surveys and updating creation of filepaths

---
 .../stonewater/Wave 3 Preparation.py          | 124 +++++++++++++++++-
 1 file changed, 117 insertions(+), 7 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d90360aa..fe1faa9d 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2,11 +2,13 @@ import os
 import PyPDF2
 import re
 import pandas as pd
+import numpy as np
 from tqdm import tqdm
 from collections import Counter
 
 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
-FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
+SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
+NUM_FOLDERS = 14
 
 
 def extract_summary_report(pdf_path):
@@ -610,11 +612,18 @@ def main():
     This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
     """
     # List only directories in the specified FILE_PATH
-    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+    survey_folders = []
+
+    # Loop over each survey folder and list its contents
+    for i in range(1, NUM_FOLDERS + 1):
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+        if os.path.isdir(folder_path):  # Check if folder exists
+            folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+            survey_folders.extend(folder_contents)  # Append contents to the master list
 
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
-        survey_folder_path = os.path.join(FILE_PATH, survey_folder)
+        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
 
         # List the folders inside of the survey folder
         survey_subfolders = [name for name in os.listdir(survey_folder_path)
@@ -623,9 +632,17 @@ def main():
         # Check if there's a "retrofit assessment" folder
         retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
 
+        ra_folder = next(
+            (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+            None
+        )
+
         # If retrofit assessment folder exists, check if it has content
-        if retrofit_folder:
-            retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+        if retrofit_folder or ra_folder:
+            if retrofit_folder:
+                retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+            else:
+                retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
             if os.listdir(retrofit_folder_path):  # If not empty
                 summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                 if summary_data:
@@ -642,6 +659,11 @@ def main():
         # If no retrofit folder or it was empty, check files in survey_folder
 
         summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+        if not summary_data:
+            if len(survey_subfolders) == 1:
+                survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+                summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
         if summary_data:
             summary_data = {
                 "survey_folder": survey_folder,
@@ -650,9 +672,14 @@ def main():
             extracted_data.append(summary_data)
 
     extracted_data = pd.DataFrame(extracted_data)
+
+    # TODO: Work out which survey folders were missed by the extraction
+
     extracted_data["Primary Energy Use (kWh/yr)"] = (
         extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
     )
+    # TODO: Clean up SAP and extract EPC
+    # TODO: RIR floor area!!!
 
     # We now merge on the coordinator data so that against each property, we can map the measures
     retrofit_packages_board = pd.read_excel(
@@ -663,7 +690,13 @@ def main():
     # We now match this retrofit packages board to the extracted data
     matching_lookup = []
     for _, home in retrofit_packages_board.iterrows():
-        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
+        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+
+        # We check that home["Name"] is contained in the survey_folder; the pattern targets punctuation only (not spaces)
+        filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+            home["Name"].replace(r"[^\w\s]", ""), case=False
+        )]
+
         if filtered.empty:
             print("Check this once we have full data")
             continue
@@ -684,8 +717,12 @@ def main():
         if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
             filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
 
+        if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
         if filtered.empty:
-            raise Exception("somethign went wrong")
+            print("Check this once we have full data2!!!")
+            continue
         if filtered.shape[0] != 1:
             raise Exception("somethign went wrong2")
 
@@ -699,6 +736,79 @@ def main():
 
     matching_lookup = pd.DataFrame(matching_lookup)
 
+    if matching_lookup["Osm. ID"].duplicated().sum():
+        raise Exception("Duplicate Osm. IDs")
+
+    if matching_lookup["survey_folder"].duplicated().sum():
+        raise Exception("Duplicate survey folders")
+
+    measure_columns = [
+        'Main Wall Insulation',
+        'Secondary Wall Insulation',
+        'Loft insulation',
+        'Flat Roof',
+        'Room in Roof',
+        'Window Upgrade',
+        'Door Upgrade',
+        'Ventilation',
+        'Main Heating',
+        'Water Heating',
+        'Heating Controls',
+        'Solar PV',
+        'Other measures'
+    ]
+
+    # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
+    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
+        retrofit_packages_board[
+            [
+                "Name",
+                "Osm. ID",
+                "Address ID",
+                "Archetype ID",
+                "Arch. Group Rank", "Archetype Representative",
+                "Actual SAP Band",
+                "Actual SAP Rating",
+                "Modelled SAP Band",
+                "Modelled SAP Rating",
+            ] + measure_columns
+            ],
+        on=["Osm. ID", "Name"],
+        how="left"
+    )
+
+    # We've appended the recommended packages and modelled SAP ratings to the data
+    # We also want to append the windows data
+    windows_data = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
+        ),
+        header=12
+    )
+
+    # We build a lookup from Osm. ID to the date the windows were fitted/renewed
+    windows_data = windows_data[
+        ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
+    ]
+    # Convert to string for the moment
+    windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
+        "Parent Asset Window attributes - Fitted/renewed date"
+    ].astype(str)
+    # Create a single date column
+    windows_data["Fitted/renewed date"] = np.where(
+        pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
+        windows_data["Window attributes - Fitted/renewed date"],
+        windows_data["Parent Asset Window attributes - Fitted/renewed date"]
+    )
+    # Convert to a date
+    windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
+    # Calculate the number of years since something was done on the windows
+    windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
+        "Fitted/renewed date"]).dt.days / 365
+
+    # TODO: Flag if a package includes windows
+
     # Save this as a csv
     # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)