From 791262fa866e420cef6a2eced9b4f4ec28897409 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 09:29:11 +0000 Subject: [PATCH] adding all surveys and updating creation of filepaths --- .../stonewater/Wave 3 Preparation.py | 124 +++++++++++++++++- 1 file changed, 117 insertions(+), 7 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d90360aa..fe1faa9d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,11 +2,13 @@ import os import PyPDF2 import re import pandas as pd +import numpy as np from tqdm import tqdm from collections import Counter CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" -FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") +SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") +NUM_FOLDERS = 14 def extract_summary_report(pdf_path): @@ -610,11 +612,18 @@ def main(): This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. """ # List only directories in the specified FILE_PATH - survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list extracted_data = [] for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(FILE_PATH, survey_folder) + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) # List the folders inside of the survey folder survey_subfolders = [name for name in os.listdir(survey_folder_path) @@ -623,9 +632,17 @@ def main(): # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + # If retrofit assessment folder exists, check if it has content - if retrofit_folder: - retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -642,6 +659,11 @@ def main(): # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, @@ -650,9 +672,14 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + + # What was missed??? + extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # TODO: Clean up SAP and extract EPC + # TODO: RIR floor area!!! # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( @@ -663,7 +690,13 @@ def main(): # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in retrofit_packages_board.iterrows(): - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", ""), case=False + )] + if filtered.empty: print("Check this once we have full data") continue @@ -684,8 +717,12 @@ def main(): if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + if filtered.empty: - raise Exception("somethign went wrong") + print("Check this once we have full data2!!!") + continue if filtered.shape[0] != 1: raise Exception("somethign went wrong2") @@ -699,6 +736,79 @@ def main(): matching_lookup = pd.DataFrame(matching_lookup) + if matching_lookup["Osm. ID"].duplicated().sum(): + raise Exception("Duplicate Osm. IDs") + + if matching_lookup["survey_folder"].duplicated().sum(): + raise Exception("Duplicate survey folders") + + measure_columns = [ + 'Main Wall Insulation', + 'Secondary Wall Insulation', + 'Loft insulation', + 'Flat Roof', + 'Room in Roof', + 'Window Upgrade', + 'Door Upgrade', + 'Ventilation', + 'Main Heating', + 'Water Heating', + 'Heating Controls', + 'Solar PV', + 'Other measures' + ] + + # We should end up with a 1:1 mapping between the Osm. ID and the survey folder + stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge( + retrofit_packages_board[ + [ + "Name", + "Osm. ID", + "Address ID", + "Archetype ID", + "Arch. Group Rank", "Archetype Representative", + "Actual SAP Band", + "Actual SAP Rating", + "Modelled SAP Band", + "Modelled SAP Rating", + ] + measure_columns + ], + on=["Osm. ID", "Name"], + how="left" + ) + + # We've appended the recommended packages and modelled SAP ratings to the data + # We also want to append the windows data + windows_data = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx" + ), + header=12 + ) + + # We get a lookup id of Osm.ID and when the windows were fitted + windows_data = windows_data[ + ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"] + ] + # Convert to string for the moment + windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ + "Parent Asset Window attributes - Fitted/renewed date" + ].astype(str) + # Create a single date column + windows_data["Fitted/renewed date"] = np.where( + pd.notnull(windows_data["Window attributes - Fitted/renewed date"]), + windows_data["Window attributes - Fitted/renewed date"], + windows_data["Parent Asset Window attributes - Fitted/renewed date"] + ) + # Convert to a date + windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"]) + # Calculate the number of years since something was done on the windows + windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[ + "Fitted/renewed date"]).dt.days / 365 + + # TODO: Flag if a package includes windows + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)