setting up the stonewater assessment extraction process

This commit is contained in:
Khalim Conn-Kowlessar 2025-01-28 18:13:07 +00:00
parent 11a4bc24a1
commit 86deed8115
2 changed files with 112 additions and 5 deletions

View file

@ -1 +0,0 @@
# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL

View file

@ -2928,14 +2928,122 @@ def revised_model():
original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
original_archetypes = original_archetypes[
["Address ID", "Archetype ID", ""]
]
# Check if we have all of the addresses
missed = original_archetypes[
~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
]["Archetype ID"].unique()
assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}
original_archetypes = original_archetypes[
["Address ID", "Archetype ID", "Archetype Group Rank"]
]
# Merge these archetypes on to the new priority postcodes
new_priority_postcodes = new_priority_postcodes.merge(
original_archetypes, how="left", on="Address ID"
)
# Basic check: no row that is still missing an Archetype ID after the merge
# should have an Address ID that appears in original_archetypes
assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin(
original_archetypes["Address ID"]
).sum()) == 0
# We pull together the survey data sheet
survey_folders = []
# Loop over each survey folder and list its contents
for i in range(1, NUM_FOLDERS + 1):
folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
if os.path.isdir(folder_path): # Check if folder exists
folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
survey_folders.extend(folder_contents) # Append contents to the master list
wave_21_folders = [
"1. Herefordshire",
"2. Bedfordshire",
"3. Wiltshire",
"4. Bournemouth",
"5. Coventry",
"6. West Sussex",
"7. Dorset",
"8. Cambridgeshire",
"9. Guildford",
"10. Little Island",
"11. CCS Dorset"
]
for wave_2_1_folder in wave_21_folders:
folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder)
if os.path.isdir(folder_path): # Check if folder exists
folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in
os.listdir(folder_path)]
survey_folders.extend(folder_contents) # Append contents to the master list
# We now do a large pull of all of the data
extracted_data = []
for survey_folder in tqdm(survey_folders):
survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
# List the folders inside of the survey folder
survey_subfolders = [
name for name in os.listdir(survey_folder_path)
if os.path.isdir(os.path.join(survey_folder_path, name))
]
# Check if there's a "retrofit assessment" folder
retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
ra_folder = next(
(name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
None
)
# If retrofit assessment folder exists, check if it has content
if retrofit_folder or ra_folder:
if retrofit_folder:
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
else:
retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
# Check if everything inside is a sub-folder and the number of folders is 2
items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
# Get the folder that isn't Property Pics
retrofit_folder_path = os.path.join(
retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
)
if os.listdir(retrofit_folder_path): # If not empty
summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
if summary_data:
summary_data = {
"survey_folder": survey_folder,
**summary_data,
}
extracted_data.append(summary_data)
continue
else:
# Then we have an empty Retrofit Assessment folder
continue
# If no retrofit folder or it was empty, check files in survey_folder
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
if not summary_data:
if len(survey_subfolders) == 1:
survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
if summary_data:
summary_data = {
"survey_folder": survey_folder,
**summary_data,
}
extracted_data.append(summary_data)
retrofit_assessment_data = pd.DataFrame(extracted_data)
# TODO - Save this data
# if __name__ == "__main__":
# main()