From bd131a2f663056fb46a906d8f148b2bcc06cd871 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 22:32:31 +0000 Subject: [PATCH] preparing outputs for stonewater --- .../stonewater/Wave 3 Preparation.py | 77 +++++++++++++++---- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 94904aae..50dadcaf 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2984,6 +2984,8 @@ def revised_model(): original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) + wave_21_folder_name = "Wave 2.1 Surveys - 2" + # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) @@ -3028,7 +3030,6 @@ def revised_model(): "10. Little Island", "11. CCS Dorset" ] - wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) @@ -3252,7 +3253,9 @@ def revised_model(): 'Main Wall Thickness', 'Main Building Alternative Wall Type', 'Main Building Alternative Wall Insulation', 'Main Building Alternative Wall Dry-lining', - 'Main Building Alternative Wall Thickness', 'Main Fuel' + 'Main Building Alternative Wall Thickness', + 'Main Fuel', + 'Main Building Age Band', ] # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] @@ -3795,7 +3798,8 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", + 'Ventilation', 'Heating', 'Other Measures', 'PV System', + "Asset ID.1_y", ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3811,6 +3815,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID.1_y': 'Organisation Reference', + "PV System": "Solar PV", } ), wates_coordination[ @@ -3818,8 +3823,7 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' - + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3835,6 +3839,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID_x': 'Organisation Reference', + "PV System": "Solar PV", } ) ] @@ -3857,12 +3862,12 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), - (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), - (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), - (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), - (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), - (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7), ] max_confidence = max([confidence for (_, confidence) in filter_levels]) @@ -3911,12 +3916,13 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": 1, "Was Surveyed": True } for m in survey_result["Organisation Reference"].values ] matches.extend(to_extend) continue - blah + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) @@ -3926,6 +3932,7 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": match_confidence, "Was Surveyed": False } for m in closest_match["Organisation Reference"].values ] @@ -3953,10 +3960,29 @@ def revised_model(): suffixes=("", " - Closest Match") ) + measures_columns = [ + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures' + ] + # We want to aggregate the matches, when we have multiple aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + + measures = coordinated_packages[ + ( + coordinated_packages["Organisation Reference"].isin( + mapped_matches['Best Match Organisation Reference'].values + ) + ) + ][measures_columns] + if mapped_matches.shape[0] == 1: + # Get the measures for this property + measures = measures.squeeze() + aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3965,6 +3991,7 @@ def revised_model(): "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], "Was Surveyed": mapped_matches["Was Surveyed"].values[0], + **measures } ) continue @@ -3978,6 +4005,17 @@ def revised_model(): mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ 0] / number_of_matches * 100 ) + + measures_aggregated = {} + for m in measures_columns: + if any(~pd.isnull(measures[m])): + # Check if we have 2 unique values + vals = measures[~pd.isnull(measures[m])][m].unique() + if len(vals) > 1: + measures_aggregated[m] = ", ".join(vals) + else: + measures_aggregated[m] = vals[0] + aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3985,7 +4023,8 @@ def revised_model(): "Proportion": proportion_with_this_epc, "Estimated SAP Rating": average_rating, "Estimated EPC Rating": average_epc_rating, - "Was Surveyed": False + "Was Surveyed": False, + **measures_aggregated } ) @@ -4002,7 +4041,6 @@ def revised_model(): def remove_leading_zero(address): return re.sub(r"^0([1-9]) ", r"\1 ", address) - # Example usage mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) mapped_priority_list["address1"] = np.where( mapped_priority_list["Organisation Reference"] == 37004, @@ -4020,6 +4058,13 @@ def revised_model(): ) mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + # Flag where 2 out of the three columns have consensus + mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( + (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | + (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | + (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) + ) + # Let's get the newest EPC data for these properties # We merge on UPRN, when we have it # from etl.route_march_data_pull.app import get_data @@ -4081,6 +4126,7 @@ def revised_model(): 'Survey: Main Building Alternative Wall Dry-lining', 'Survey: Main Building Alternative Wall Thickness', 'Survey: Main Fuel', + 'Survey: Main Building Age Band', 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' ] ].rename( @@ -4133,7 +4179,8 @@ def revised_model(): [ "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', - 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', + 'Survey: Main Building Wall Area (m2)', ] ].rename( columns={