diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c8e61a0e..426097e8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1669,7 +1669,7 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # TODO: We drop 7 properties missing # UPRN asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] # Clean address ids @@ -1699,15 +1699,23 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), header=0 ) - survey_results = survey_results.merge( + + survey_results = survey_results.drop( + columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] + ).merge( additional_survey_data[ [ "Address ID", "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness" + "Main Building Alternative Wall Thickness", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" ] - ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}), + ].rename( + columns={ + "Main Wall Insulation_x": "Main Wall Insulation Type", + } + ), how="left", on="Address ID" ) @@ -1718,6 +1726,7 @@ def propsed_wave_3_sample(): "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", "Existing Primary Heating System", + "Package Ref", "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", "Main Building Alternative Wall Thickness" @@ -1727,6 +1736,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ @@ -1929,7 +1939,7 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ - "Survey: Matching Address ID" + "Survey: Matching Address ID", "Package Ref" ] ], on="Address ID", @@ -2005,6 +2015,7 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"], "Match Type": match_type } ) @@ -2015,7 +2026,8 @@ def propsed_wave_3_sample(): columns=[ "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', + "Match Type" ] ) @@ -2032,8 +2044,8 @@ def propsed_wave_3_sample(): # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & - pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed in region", region_assets["Confidence Tier"] + pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]), + region_assets["Match Type"], region_assets["Confidence Tier"] ) # Handle EPC C @@ -2046,86 +2058,7 @@ def propsed_wave_3_sample(): region_assets = fill_survey_columns(region_assets, suffix="_method1") method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] - region_assets = region_assets.drop(columns=method_1_columns) - - missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - - # archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - raise Exception("IMPLEMENT ME") - # archetype_data["distance_meters"] = haversine( - # lat1=property.latitude, lon1=property.longitude, - # lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - # ) - # expected_sap = np.average( - # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - # ) - # expected_epc = sap_to_epc(expected_sap) - # archetype_surveyed.append( - # { - # "Archetype ID": arch_id, - # "Address ID": property["Address ID"], - # "Current EPC Band": expected_epc - # } - # ) - # archetype_surveyed = pd.DataFrame(archetype_surveyed) - # if archetype_surveyed.empty: - # archetype_surveyed = pd.DataFrame( - # columns=[ - # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - # 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' - # ] - # ) - # - # region_assets = region_assets.merge( - # archetype_surveyed, - # on=["Archetype ID", "Address ID"], - # how="left", - # suffixes=("", "_method2") - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - # region_assets["Confidence Tier"]), - # "2 - same archetype", region_assets["Confidence Tier"] - # ) - # - # for col in [ - # 'Current EPC Band', 'Current SAP Rating', - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', - # 'Survey: Main Roof Type', 'Survey: Primary Heating System', - # 'Survey: Matching Address ID', 'Distance to Closest Match (m)' - # ]: - # region_assets[col] = np.where( - # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), - # region_assets[col + "_method2"], region_assets[col] - # ) - # - # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] - # region_assets = region_assets.drop(columns=method_2_columns) - - # We label EPC C properties - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Current EPC Band"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "C", region_assets["Current EPC Band"] - # ) + region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"]) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() @@ -2217,6 +2150,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": "Not Surveyed", "Survey: Matching Address ID": "Not Surveyed", 'Distance to Closest Match (m)': 9999999, + "Package Ref": "Not Surveyed", } ) continue @@ -2261,6 +2195,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"] } ) continue @@ -2292,8 +2227,10 @@ def propsed_wave_3_sample(): # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( - ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + - survey_attribute_columns): + [ + "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns + ): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") @@ -2382,5 +2319,76 @@ def propsed_wave_3_sample(): total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() print(total_bid_size) + # Label final outputs + # We create a summary of packages by street + results["Package Ref"] = results["Package Ref"].fillna("Incomplete") + results["Package Ref"] = results["Package Ref"].astype(str) + package_summary = results.pivot_table( + index='Street and Region', + columns='Package Ref', + aggfunc='size', + fill_value=0 + ).reset_index() + + street_bid_structure = street_summary.merge( + package_summary, how="left", on="Street and Region" + ) + street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + individual_units_programme = results.copy() + individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( + street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values + ) + + # Merge on Stonewaters ID + asset_list_ids = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + )[["Address ID", "Org. ref."]] + # Clean address ids + asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] + asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] + asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) + individual_units_programme = individual_units_programme.merge( + asset_list_ids, + how="left", + on="Address ID", + ) + + individual_units_programme = individual_units_programme.merge( + asset_list_ids.rename( + columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"} + ), + how="left", + on="Survey: Matching Address ID" + ) + + individual_units_programme["Survey: Org. ref."] = np.where( + (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"), + "Not Surveyed", + individual_units_programme["Survey: Org. ref."] + ) + + if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull( + individual_units_programme["Org. ref."]).sum(): + raise ValueError("something went wrong") + + for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]: + individual_units_programme[col] = ( + individual_units_programme[col] + .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':' + .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': ' + .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space + .str.strip() # Strip leading/trailing spaces + ) + + individual_units_programme.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False + ) + # if __name__ == "__main__": # main()