mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
adding all surveys and updating creation of filepaths
This commit is contained in:
parent
dbee05e555
commit
791262fa86
1 changed file with 117 additions and 7 deletions
|
|
@ -2,11 +2,13 @@ import os
|
|||
import PyPDF2
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from collections import Counter
|
||||
|
||||
CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
|
||||
FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys")
|
||||
SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
|
||||
NUM_FOLDERS = 14
|
||||
|
||||
|
||||
def extract_summary_report(pdf_path):
|
||||
|
|
@ -610,11 +612,18 @@ def main():
|
|||
This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
|
||||
"""
|
||||
# List only directories in the specified FILE_PATH
|
||||
survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
|
||||
survey_folders = []
|
||||
|
||||
# Loop over each survey folder and list its contents
|
||||
for i in range(1, NUM_FOLDERS + 1):
|
||||
folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
|
||||
if os.path.isdir(folder_path): # Check if folder exists
|
||||
folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
|
||||
survey_folders.extend(folder_contents) # Append contents to the master list
|
||||
|
||||
extracted_data = []
|
||||
for survey_folder in tqdm(survey_folders):
|
||||
survey_folder_path = os.path.join(FILE_PATH, survey_folder)
|
||||
survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
|
||||
|
||||
# List the folders inside of the survey folder
|
||||
survey_subfolders = [name for name in os.listdir(survey_folder_path)
|
||||
|
|
@ -623,9 +632,17 @@ def main():
|
|||
# Check if there's a "retrofit assessment" folder
|
||||
retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
|
||||
|
||||
ra_folder = next(
|
||||
(name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
|
||||
None
|
||||
)
|
||||
|
||||
# If a retrofit assessment folder (or an RA info folder) exists, check if it has content
|
||||
if retrofit_folder:
|
||||
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
|
||||
if retrofit_folder or ra_folder:
|
||||
if retrofit_folder:
|
||||
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
|
||||
else:
|
||||
retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
|
||||
if os.listdir(retrofit_folder_path): # If not empty
|
||||
summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
|
||||
if summary_data:
|
||||
|
|
@ -642,6 +659,11 @@ def main():
|
|||
# If no retrofit folder or it was empty, check files in survey_folder
|
||||
|
||||
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
|
||||
if not summary_data:
|
||||
if len(survey_subfolders) == 1:
|
||||
survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
|
||||
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
|
||||
|
||||
if summary_data:
|
||||
summary_data = {
|
||||
"survey_folder": survey_folder,
|
||||
|
|
@ -650,9 +672,14 @@ def main():
|
|||
extracted_data.append(summary_data)
|
||||
|
||||
extracted_data = pd.DataFrame(extracted_data)
|
||||
|
||||
# What was missed???
|
||||
|
||||
extracted_data["Primary Energy Use (kWh/yr)"] = (
|
||||
extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
|
||||
)
|
||||
# TODO: Clean up SAP and extract EPC
|
||||
# TODO: RIR floor area!!!
|
||||
|
||||
# We now merge in the coordinator data so that we can map the measures against each property
|
||||
retrofit_packages_board = pd.read_excel(
|
||||
|
|
@ -663,7 +690,13 @@ def main():
|
|||
# We now match this retrofit packages board to the extracted data
|
||||
matching_lookup = []
|
||||
for _, home in retrofit_packages_board.iterrows():
|
||||
filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()]
|
||||
filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
|
||||
|
||||
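# We check that home["Name"] is contained in the survey_folder (case-insensitive), after stripping punctuation
# (the pattern [^\w\s] keeps whitespace). NOTE(review): home["Name"].replace is a plain str.replace, so the
# pattern is treated as literal text there; confirm that regex stripping is actually applied on both sides.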
|
||||
filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
|
||||
home["Name"].replace(r"[^\w\s]", ""), case=False
|
||||
)]
|
||||
|
||||
if filtered.empty:
|
||||
print("Check this once we have full data")
|
||||
continue
|
||||
|
|
@ -684,8 +717,12 @@ def main():
|
|||
if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
|
||||
filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
|
||||
|
||||
if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
|
||||
filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
|
||||
|
||||
if filtered.empty:
|
||||
raise Exception("somethign went wrong")
|
||||
print("Check this once we have full data2!!!")
|
||||
continue
|
||||
if filtered.shape[0] != 1:
|
||||
raise Exception("somethign went wrong2")
|
||||
|
||||
|
|
@ -699,6 +736,79 @@ def main():
|
|||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
|
||||
if matching_lookup["Osm. ID"].duplicated().sum():
|
||||
raise Exception("Duplicate Osm. IDs")
|
||||
|
||||
if matching_lookup["survey_folder"].duplicated().sum():
|
||||
raise Exception("Duplicate survey folders")
|
||||
|
||||
measure_columns = [
|
||||
'Main Wall Insulation',
|
||||
'Secondary Wall Insulation',
|
||||
'Loft insulation',
|
||||
'Flat Roof',
|
||||
'Room in Roof',
|
||||
'Window Upgrade',
|
||||
'Door Upgrade',
|
||||
'Ventilation',
|
||||
'Main Heating',
|
||||
'Water Heating',
|
||||
'Heating Controls',
|
||||
'Solar PV',
|
||||
'Other measures'
|
||||
]
|
||||
|
||||
# We should end up with a 1:1 mapping between the Osm. ID and the survey folder
|
||||
stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge(
|
||||
retrofit_packages_board[
|
||||
[
|
||||
"Name",
|
||||
"Osm. ID",
|
||||
"Address ID",
|
||||
"Archetype ID",
|
||||
"Arch. Group Rank", "Archetype Representative",
|
||||
"Actual SAP Band",
|
||||
"Actual SAP Rating",
|
||||
"Modelled SAP Band",
|
||||
"Modelled SAP Rating",
|
||||
] + measure_columns
|
||||
],
|
||||
on=["Osm. ID", "Name"],
|
||||
how="left"
|
||||
)
|
||||
|
||||
# We've appended the recommended packages and modelled SAP ratings to the data
|
||||
# We also want to append the windows data
|
||||
windows_data = pd.read_excel(
|
||||
os.path.join(
|
||||
CUSTOMER_FOLDER_PATH,
|
||||
"Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
|
||||
),
|
||||
header=12
|
||||
)
|
||||
|
||||
# We build a lookup from Osm. ID to the dates on which the windows were fitted/renewed
|
||||
windows_data = windows_data[
|
||||
["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"]
|
||||
]
|
||||
# Convert to string for the moment
|
||||
windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
|
||||
"Parent Asset Window attributes - Fitted/renewed date"
|
||||
].astype(str)
|
||||
# Create a single date column
|
||||
windows_data["Fitted/renewed date"] = np.where(
|
||||
pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
|
||||
windows_data["Window attributes - Fitted/renewed date"],
|
||||
windows_data["Parent Asset Window attributes - Fitted/renewed date"]
|
||||
)
|
||||
# Convert to a date
|
||||
windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
|
||||
# Calculate the number of years since something was done on the windows
|
||||
windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
|
||||
"Fitted/renewed date"]).dt.days / 365
|
||||
|
||||
# TODO: Flag if a package includes windows
|
||||
|
||||
# Save this as a csv
|
||||
# extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue