diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d9b5c41d..5c4da35b 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -1,4 +1,6 @@
 import os
+from pyexpat import features
+
 import PyPDF2
 import re
 import pandas as pd
@@ -1704,7 +1706,6 @@ def append_stonewater_id():
     )
     model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])]
     model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int)
-    z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values()
 
     original_archetypes = pd.read_excel(
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
@@ -2942,7 +2943,6 @@ def revised_model():
     """
 
     # 1) Create the new list of properties
-
     new_priority_postcodes = pd.read_excel(
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 "
         "priority list.xlsx"
@@ -3188,7 +3188,13 @@ def revised_model():
             wates_coordination_sheet_abeyance
         ]
     )
-
+    # We correct the Asset ID for 34 Kempster Close
+    wates_coordination["Asset ID"] = np.where(
+        wates_coordination["Name"] == "34 Kempster Close",
+        "12005",
+        wates_coordination["Asset ID"]
+    )
+
     wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
         lambda x: extract_sharepoint_url(x)
     )
@@ -3198,6 +3204,14 @@ def revised_model():
     ############################################################
     # NEW 450 COORDINATED RETROFIT ASSESSMENTS
     #############################################################
+    features = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+    features["Address ID"] = features["Address ID"].astype(str).astype(int)
+    features_to_merge = features[["Address ID", "Organisation Reference"]]
+
     retrofit_packages_board = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH,
@@ -3211,6 +3225,10 @@ def revised_model():
         retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
     ]
 
+    retrofit_packages_board = retrofit_packages_board.merge(
+        features_to_merge, how="left", on="Address ID"
+    )
+
     manual_filters = {
         "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
         "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
@@ -3527,6 +3545,220 @@ def revised_model():
             continue
 
         raise Exception("No match")
+    wates_matching_lookup = pd.DataFrame(wates_matching_lookup)
+
+    # Merge lookup tables onto the coordination sheets
+    wates_coordination = wates_coordination.merge(
+        wates_matching_lookup, how="left", on="Name"
+    )
+    missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])]
+    if not missed_asset_id.empty:
+        # We fill the missing ids
+        missing_lookup = {
+            "4 Sydnall Fields": 31231,
+            "12 Sydnall Fields": 31239,
+            "12 Athena Gardens": 28061,
+            "49 Banner Lane": 41189,
+            "4 Jonathan Road": 41232,
+            "8 Jonathan Road": 41236,
+            "1 Jonathan Road": 41229,
+            "96 Taunton Way": 31417,
+            "94 Taunton Way": 31418,
+            "1 Lady Lane": 29430,
+            "10 Jonathan Road": 41283,
+            "21 Jonathan Road": 41246,
+            "12 Ashcroft Close": 26399
+        }
+        for name, asset_id in missing_lookup.items():
+            wates_coordination["Asset ID_x"] = np.where(
+                wates_coordination["Name"] == name,
+                asset_id,
+                wates_coordination["Asset ID_x"]
+            )
+
+    ccs_coordination = ccs_coordination.merge(
+        ccs_matching_lookup, how="left", on="Name"
+    )
+
+    retrofit_packages_board = retrofit_packages_board.merge(
+        matching_lookup, how="left", on="Name"
+    )
+
+    # We combine this into a single board
+    coordinated_packages = pd.concat(
+        [
+            retrofit_packages_board[
+                [
+                    "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating',
+                    'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref',
+                    'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+                    'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+                    'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+                    'Solar PV', 'Other measures', 'Organisation Reference',
+                ]
+            ],
+            ccs_coordination[
+                [
+                    # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls,
+                    # Solar PV
+                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                    'SAP Band Install Package', 'Package Approved (Client)',
+                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
+                ]
+            ].rename(
+                columns={
+                    "SAP Band Pre": "Actual SAP Band",
+                    "SAP Rating Pre": "Actual SAP Rating",
+                    'SAP Rating Install Package': 'Modelled SAP Rating',
+                    'SAP Band Install Package': 'Modelled SAP Band',
+                    'Package Approved (Client)': 'Package Ref',
+                    'Wall Insulation': 'Main Wall Insulation',
+                    'Loft Insulation': 'Loft insulation',
+                    'Windows Upgrade': 'Window Upgrade',
+                    'Ext. Doors Upgrade': 'Door Upgrade',
+                    'Heating': 'Main Heating',
+                    'Other Measures': 'Other measures',
+                    'Asset ID.1_y': 'Organisation Reference',
+                }
+            ),
+            wates_coordination[
+                [
+                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                    'SAP Band Install Package', 'Package Approved (Client)',
+                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
+
+                ]
+            ].rename(
+                columns={
+                    "SAP Band Pre": "Actual SAP Band",
+                    "SAP Rating Pre": "Actual SAP Rating",
+                    'SAP Rating Install Package': 'Modelled SAP Rating',
+                    'SAP Band Install Package': 'Modelled SAP Band',
+                    'Package Approved (Client)': 'Package Ref',
+                    'Wall Insulation': 'Main Wall Insulation',
+                    'Loft Insulation': 'Loft insulation',
+                    'Windows Upgrade': 'Window Upgrade',
+                    'Ext. Doors Upgrade': 'Door Upgrade',
+                    'Heating': 'Main Heating',
+                    'Other Measures': 'Other measures',
+                    'Asset ID_x': 'Organisation Reference',
+                }
+            )
+        ]
+    )
+
+    coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+
+    # Merge the property features on
+    coordinated_packages = coordinated_packages.merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+        how="left",
+        on="Organisation Reference"
+    )
+
+    # We need the features pertaining to these priority postcodes
+
+    def find_nearest_matching_property(coordinated_packages, home):
+        """
+        Find the coordinated packages that most closely resemble a home.
+
+        The filter levels are tried in order, from most to least specific, and the
+        rows matching the first level with at least one hit are returned.
+
+        :param coordinated_packages: DataFrame holding every column named in the
+            filter levels
+        :param home: row-like object holding the same columns
+        :return: DataFrame of the matching rows, or None if no level matches
+        """
+        filter_levels = [
+            ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
+            ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
+        ]
+
+        for filters in filter_levels:
+            # Combine the per-column comparisons into one mask instead of copying
+            # and re-slicing the whole frame for every level
+            is_match = np.ones(len(coordinated_packages), dtype=bool)
+            for col in filters:
+                is_match &= (coordinated_packages[col] == home[col]).to_numpy()
+
+            if is_match.any():
+                return coordinated_packages[is_match]
+
+        return None  # No match found
+
+    coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
+    new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip()
+
+    coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip()
+    new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip()
+
+    coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
+    new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]
+
+    # For every property in the priority postcodes data, we look for a most appropriate matching property
+    no_match = []
+    matches = []
+    for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
+        closest_match = find_nearest_matching_property(coordinated_packages, home)
+        if closest_match is None:
+            no_match.append(home["Organisation Reference"])
+            continue
+
+        to_extend = [
+            {
+                "Organisation Reference": home["Organisation Reference"],
+                "Best Match Organisation Reference": m
+            } for m in closest_match["Organisation Reference"].values
+        ]
+        matches.extend(to_extend)
+
+    no_match_summary = new_priority_postcodes[
+        new_priority_postcodes["Organisation Reference"].isin(
+            no_match
+        )
+    ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[
+        "Organisation Reference"].count().reset_index()
+
+    no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
+
+    # len(no_match)
+    # 8764, 5607
+    # no_match_summary.shape
+    # (3953, 6), (2948, 6)
+
+    # We match the properties to their closest match
+
+    matches_df = pd.DataFrame(matches)
+    matches_df = matches_df.merge(
+        coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]],
+        left_on="Best Match Organisation Reference", right_on="Organisation Reference",
+        suffixes=("", " - Closest Match")
+    )
+    # We want to aggregate the matches, when we have multiple
+    aggregated_matches_df = []
+    for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+        if mapped_matches.shape[0] == 1:
+            # NOTE(review): assumes a lone match carries the whole weight, i.e.
+            # Proportion == 1.0 — confirm
+            aggregated_matches_df.append(
+                mapped_matches.assign(**{"Number of matches": 1, "Proportion": 1.0})
+            )
+            continue
+
+    mapped_priority_list = new_priority_postcodes.merge(
+        matches_df, on="Organisation Reference",
+    )
+    # NOTE(review): matches_df already carries "Actual SAP Band" and
+    # "Actual SAP Rating" for the matched properties. The argument-less
+    # `mapped_priority_list.merge()` that stood here raised TypeError and has
+    # been dropped — confirm no further EPC columns are needed.
 
 # if __name__ == "__main__":
 #     main()