diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py deleted file mode 100644 index 1a5ddff7..00000000 --- a/etl/customers/panacap_ventures/sample_remote_assessments.py +++ /dev/null @@ -1 +0,0 @@ -# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1bf0638..105628e9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2928,14 +2928,122 @@ def revised_model(): original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", ""] - ] - # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() + assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} + + original_archetypes = original_archetypes[ + ["Address ID", "Archetype ID", "Archetype Group Rank"] + ] + + # Merge these archetypes on to the new priority postcodes + new_priority_postcodes = new_priority_postcodes.merge( + original_archetypes, how="left", on="Address ID" + ) + + # Basic check: after the merge, no row with a missing Archetype ID should have an Address ID that exists in original_archetypes + assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin( + original_archetypes["Address ID"] + ).sum()) == 0 + + # We pull together the survey data sheet + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # 
Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + wave_21_folders = [ + "1. Herefordshire", + "2. Bedfordshire", + "3. Wiltshire", + "4. Bournemouth", + "5. Coventry", + "6. West Sussex", + "7. Dorset", + "8. Cambridgeshire", + "9. Guildford", + "10. Little Island", + "11. CCS Dorset" + ] + + for wave_2_1_folder in wave_21_folders: + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + # We now do a large pull of all of the data + extracted_data = [] + for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + + # List the folders inside of the survey folder + survey_subfolders = [ + name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name)) + ] + + # Check if there's a "retrofit assessment" folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + + # If retrofit assessment folder exists, check if it has content + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = 
[os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + continue + else: + # Then we have an empty Retrofit Assessment folder + continue + + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + + retrofit_assessment_data = pd.DataFrame(extracted_data) + # TODO - Save this data + # if __name__ == "__main__": # main()