adding all surveys and updating creation of filepaths

2026-07-27 23:35:01 +00:00 · 2024-10-30 09:29:11 +00:00 · 2024-10-30 09:29:11 +00:00 · 791262fa86
commit 791262fa86
parent dbee05e555
1 changed files with 117 additions and 7 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -2,11 +2,13 @@ import os
 import PyPDF2
 import re
 import pandas as pd
+import numpy as np
 from tqdm import tqdm
 from collections import Counter

 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
-FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
+SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
+NUM_FOLDERS = 14


 def extract_summary_report(pdf_path):
@ -610,11 +612,18 @@ def main():
    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
    """
    # List only directories in the specified FILE_PATH
-    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+    survey_folders = []
+
+    # Loop over each survey folder and list its contents
+    for i in range(1, NUM_FOLDERS + 1):
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+        if os.path.isdir(folder_path):  # Check if folder exists
+            folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+            survey_folders.extend(folder_contents)  # Append contents to the master list

    extracted_data = []
    for survey_folder in tqdm(survey_folders):
-        survey_folder_path = os.path.join(FILE_PATH, survey_folder)
+        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)

        # List the folders inside of the survey folder
        survey_subfolders = [name for name in os.listdir(survey_folder_path)
@ -623,9 +632,17 @@ def main():
        # Check if there's a "retrofit assessment" folder
        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)

+        ra_folder = next(
+            (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+            None
+        )
+
        # If retrofit assessment folder exists, check if it has content
-        if retrofit_folder:
-            retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+        if retrofit_folder or ra_folder:
+            if retrofit_folder:
+                retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+            else:
+                retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
            if os.listdir(retrofit_folder_path):  # If not empty
                summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                if summary_data:
@ -642,6 +659,11 @@ def main():
        # If no retrofit folder or it was empty, check files in survey_folder

        summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+        if not summary_data:
+            if len(survey_subfolders) == 1:
+                survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+                summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
        if summary_data:
            summary_data = {
                "survey_folder": survey_folder,
@ -650,9 +672,14 @@ def main():
            extracted_data.append(summary_data)

    extracted_data = pd.DataFrame(extracted_data)
+
+    # TODO: work out which survey folders produced no extracted data (i.e. what was missed)
+
    extracted_data["Primary Energy Use (kWh/yr)"] = (
        extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
    )
+    # TODO: Clean up SAP and extract EPC
+    # TODO: RIR floor area!!!

    # We now merge on the coordinator data so that against each property, we can map the measures
    retrofit_packages_board = pd.read_excel(
@ -663,7 +690,13 @@ def main():
    # We now match this retrofit packages board to the extracted data
    matching_lookup = []
    for _, home in retrofit_packages_board.iterrows():
-        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
+        filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+
+        # Keep rows whose survey_folder contains home["Name"]; the pattern targets punctuation only (whitespace is kept) - TODO confirm it is applied as a regex
+        filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+            home["Name"].replace(r"[^\w\s]", ""), case=False
+        )]
+
        if filtered.empty:
            print("Check this once we have full data")
            continue
@ -684,8 +717,12 @@ def main():
        if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
            filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]

+        if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
        if filtered.empty:
-            raise Exception("somethign went wrong")
+            print("Check this once we have full data2!!!")
+            continue
        if filtered.shape[0] != 1:
            raise Exception("somethign went wrong2")

@ -699,6 +736,79 @@ def main():

    matching_lookup = pd.DataFrame(matching_lookup)

+    if matching_lookup["Osm. ID"].duplicated().sum():
+        raise Exception("Duplicate Osm. IDs")
+
+    if matching_lookup["survey_folder"].duplicated().sum():
+        raise Exception("Duplicate survey folders")
+
+    measure_columns = [
+        'Main Wall Insulation',
+        'Secondary Wall Insulation',
+        'Loft insulation',
+        'Flat Roof',
+        'Room in Roof',
+        'Window Upgrade',
+        'Door Upgrade',
+        'Ventilation',
+        'Main Heating',
+        'Water Heating',
+        'Heating Controls',
+        'Solar PV',
+        'Other measures'
+    ]
+
+    # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
+    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
+        retrofit_packages_board[
+            [
+                "Name",
+                "Osm. ID",
+                "Address ID",
+                "Archetype ID",
+                "Arch. Group Rank", "Archetype Representative",
+                "Actual SAP Band",
+                "Actual SAP Rating",
+                "Modelled SAP Band",
+                "Modelled SAP Rating",
+            ] + measure_columns
+            ],
+        on=["Osm. ID", "Name"],
+        how="left"
+    )
+
+    # We've appended the recommended packages and modelled SAP ratings to the data
+    # We also want to append the windows data
+    windows_data = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
+        ),
+        header=12
+    )
+
+    # Build a lookup from Osm. ID to the dates the windows were fitted/renewed
+    windows_data = windows_data[
+        ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
+    ]
+    # Convert the parent-asset date column to string for the moment
+    windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
+        "Parent Asset Window attributes - Fitted/renewed date"
+    ].astype(str)
+    # Create a single date column
+    windows_data["Fitted/renewed date"] = np.where(
+        pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
+        windows_data["Window attributes - Fitted/renewed date"],
+        windows_data["Parent Asset Window attributes - Fitted/renewed date"]
+    )
+    # Convert to a date
+    windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
+    # Calculate the number of years since the windows were fitted/renewed
+    windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
+        "Fitted/renewed date"]).dt.days / 365
+
+    # TODO: Flag if a package includes windows
+
    # Save this as a csv
    # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)