diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index f32dcea6..70ceb76d 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 127 +PORTFOLIO_ID = 128 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "19 Hillcrest Court", - "postcode": "IP21 4YJ", - "uprn": 2630134524, + "address": "46", + "postcode": "BS6 7BD", + "uprn": 61091, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 2630134524, - "valuation": 96_000 + "uprn": 61091, + "valuation": 897_000 } ] # Store valuation data to s3 diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 12158671..94904aae 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3028,11 +3028,12 @@ def revised_model(): "10. Little Island", "11. 
CCS Dorset" ] + wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list @@ -3179,18 +3180,32 @@ def revised_model(): # Save # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False # ) # mtp_df.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), ) mtp_df = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), ) + # There are a few duplicates we just manually drop + mtp_df = mtp_df.drop_duplicates() + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" + ) & (~mtp_df["has_pv"])) + ] + + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. 
Bedfordshire/(147) Gilpin Close 5" + ) & (~mtp_df["has_pv"])) + ] + # Remove some definite duplicates dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] @@ -3487,7 +3502,7 @@ def revised_model(): ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] ccs_manual_filters = { - "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" + "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" } ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): @@ -3583,13 +3598,13 @@ def revised_model(): ] wates_manual_filters = { - "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View", - "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft", - "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31 Rabley Wood View", - 'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13', - "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4", - '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1', - '2 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 2', + "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', + '2 Jephson Court': f'{wave_21_folder_name}/5. 
Coventry/Jesphson Court 2', } wates_matching_lookup = [] # Examples to skip when we cannot get the data @@ -3720,6 +3735,9 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + if wates_coordination["Asset ID_x"].duplicated().sum(): + raise Exception("Duplicated IDs in wates") + # We merge the mpt data on to the wates coordination wates_coordination = wates_coordination.merge( mtp_df, how="left", on="survey_folder" @@ -3839,29 +3857,31 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), ] - for i, filters in enumerate(filter_levels): + max_confidence = max([confidence for (_, confidence) in filter_levels]) + + for i, (filters, match_confidence) in enumerate(filter_levels): match = coordinated_packages.copy() for col in filters: match = match[match[col] == home[col]] if not match.empty: - return match + return match, match_confidence # Finally, we search for a property in the same Archetype match = 
coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] if not match.empty: - return match + return match, max_confidence + 1 - return None # No match found + return None, None # No match found coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() @@ -3896,8 +3916,8 @@ def revised_model(): ] matches.extend(to_extend) continue - - closest_match = find_nearest_matching_property(coordinated_packages, home) + + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) continue diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 010902ce..a5da0c79 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -86,8 +86,14 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) + folders_to_keep = [ + "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth", + "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire", + "9. Guildford", "10. Little Island", "11. CCS Dorset", + ] + folders_to_pull = [ - folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. 
Coventry"] + folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: # Get the contents @@ -109,35 +115,40 @@ def download_data_from_sharepoint(): ) if not property_folder_contents.get("value"): continue - # We look for the retrofit assessment folder: + # We look for the retrofit assessment folder or mtp folders: property_sub_folders = [ - f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() + f for f in property_folder_contents["value"] if + "ra coordinator info" in f["name"].lower() or + "retrofit assessment" in f["name"].lower() or + "ra info" in f["name"].lower() or + "mtp" in f["name"].lower() or + "mid-term" in f["name"].lower() ] if not property_sub_folders: continue - # if we have this, we download the folder and store it on my laptop! - property_sub_folder = property_sub_folders[0] + for property_sub_folder in property_sub_folders: + # if we have this, we download the folder and store it on my laptop! 
- property_folder_path = os.path.join( - "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + property_folder_path = os.path.join( + "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - download_dir = os.path.join( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + download_dir = os.path.join( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - # We download the folder - sharepoint_client.download_sharepoint_folder( - drive_id=sharepoint_client.document_drive["id"], - folder_path=property_folder_path, - download_dir=download_dir, - excluded_file_types=["MOV", "jpg"] - ) + # We download the folder + sharepoint_client.download_sharepoint_folder( + drive_id=sharepoint_client.document_drive["id"], + folder_path=property_folder_path, + download_dir=download_dir, + excluded_file_types=["MOV", "jpg"] + )