Add all surveys and update the creation of file paths

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-30 09:29:11 +00:00
parent dbee05e555
commit 791262fa86

View file

@ -2,11 +2,13 @@ import os
import PyPDF2
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
NUM_FOLDERS = 14
def extract_summary_report(pdf_path):
@ -610,11 +612,18 @@ def main():
This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
"""
# List only directories in the specified FILE_PATH
survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
survey_folders = []
# Loop over each survey folder and list its contents
for i in range(1, NUM_FOLDERS + 1):
folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
if os.path.isdir(folder_path): # Check if folder exists
folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
survey_folders.extend(folder_contents) # Append contents to the master list
extracted_data = []
for survey_folder in tqdm(survey_folders):
survey_folder_path = os.path.join(FILE_PATH, survey_folder)
survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
# List the folders inside of the survey folder
survey_subfolders = [name for name in os.listdir(survey_folder_path)
@ -623,9 +632,17 @@ def main():
# Check if there's a "retrofit assessment" folder
retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
ra_folder = next(
(name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
None
)
# If a retrofit assessment folder (or, failing that, an RA info folder) exists, check if it has content
if retrofit_folder:
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
if retrofit_folder or ra_folder:
if retrofit_folder:
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
else:
retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
if os.listdir(retrofit_folder_path): # If not empty
summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
if summary_data:
@ -642,6 +659,11 @@ def main():
# If no retrofit folder or it was empty, check files in survey_folder
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
if not summary_data:
if len(survey_subfolders) == 1:
survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
if summary_data:
summary_data = {
"survey_folder": survey_folder,
@ -650,9 +672,14 @@ def main():
extracted_data.append(summary_data)
extracted_data = pd.DataFrame(extracted_data)
# What was missed???
extracted_data["Primary Energy Use (kWh/yr)"] = (
extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
)
# TODO: Clean up SAP and extract EPC
# TODO: RIR floor area!!!
# We now merge on the coordinator data so that against each property, we can map the measures
retrofit_packages_board = pd.read_excel(
@ -663,7 +690,13 @@ def main():
# We now match this retrofit packages board to the extracted data
matching_lookup = []
for _, home in retrofit_packages_board.iterrows():
filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
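# Keep only rows whose survey_folder contains home["Name"], ignoring case.
# The pattern [^\w\s] targets punctuation only; whitespace is kept.
# NOTE(review): neither replace call passes regex=True, and Python's str.replace is always literal,
# so punctuation may not actually be stripped (pandas behaviour depends on version) - confirm.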
filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
home["Name"].replace(r"[^\w\s]", ""), case=False
)]
if filtered.empty:
print("Check this once we have full data")
continue
@ -684,8 +717,12 @@ def main():
if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
if filtered.empty:
raise Exception("somethign went wrong")
print("Check this once we have full data2!!!")
continue
if filtered.shape[0] != 1:
raise Exception("somethign went wrong2")
@ -699,6 +736,79 @@ def main():
matching_lookup = pd.DataFrame(matching_lookup)
if matching_lookup["Osm. ID"].duplicated().sum():
raise Exception("Duplicate Osm. IDs")
if matching_lookup["survey_folder"].duplicated().sum():
raise Exception("Duplicate survey folders")
measure_columns = [
'Main Wall Insulation',
'Secondary Wall Insulation',
'Loft insulation',
'Flat Roof',
'Room in Roof',
'Window Upgrade',
'Door Upgrade',
'Ventilation',
'Main Heating',
'Water Heating',
'Heating Controls',
'Solar PV',
'Other measures'
]
# We should end up with a 1:1 mapping between the Osm. ID and the survey folder
stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
retrofit_packages_board[
[
"Name",
"Osm. ID",
"Address ID",
"Archetype ID",
"Arch. Group Rank", "Archetype Representative",
"Actual SAP Band",
"Actual SAP Rating",
"Modelled SAP Band",
"Modelled SAP Rating",
] + measure_columns
],
on=["Osm. ID", "Name"],
how="left"
)
# We've appended the recommended packages and modelled SAP ratings to the data
# We also want to append the windows data
windows_data = pd.read_excel(
os.path.join(
CUSTOMER_FOLDER_PATH,
"Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
),
header=12
)
# Build a lookup from Osm. ID to when the windows were fitted/renewed (window and parent-asset window columns)
windows_data = windows_data[
["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
]
# Convert to string for the moment
windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
"Parent Asset Window attributes - Fitted/renewed date"
].astype(str)
# Create a single date column
windows_data["Fitted/renewed date"] = np.where(
pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
windows_data["Window attributes - Fitted/renewed date"],
windows_data["Parent Asset Window attributes - Fitted/renewed date"]
)
# Convert to a date
windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
# Calculate the number of years since something was done on the windows
windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
"Fitted/renewed date"]).dt.days / 365
# TODO: Flag if a package includes windows
# Save this as a csv
# extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)