From 1645f9ab9ed84bdb90fa2a732d697111b36bd17b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:00:00 +0000 Subject: [PATCH] updating stonewater modelling code to use new data --- .../stonewater/Wave 3 Preparation.py | 288 +++++++++++++++--- 1 file changed, 247 insertions(+), 41 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 426097e8..f4195592 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1071,10 +1071,13 @@ def main(): ] # We now merge on the coordinator data so that against each property, we can map the measures + # TODO: Get the pre & post primary energy numbers + # TODO: Make sure the numbers are going down + retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, - "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx" + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" ), header=4 ) @@ -1084,6 +1087,18 @@ def main(): retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] + # populated_primary_energy = retrofit_packages_board[ + # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)']) + # ] + # + # z = populated_primary_energy[ + # populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[ + # 'BASE Primary energy (13a-272)'] + # ] + # + # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[ + # 'BASE Primary energy (13a-272)']) + # Replace \n with "" extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") @@ -1192,7 +1207,7 @@ def main(): # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - if len(missing_ids) != 6: + if len(missing_ids) != 1: raise Exception("Unacceptable number of missings") if matching_lookup["Address ID"].duplicated().sum(): @@ -1239,7 +1254,6 @@ def main(): if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") - # Create a section for costs for measure in measure_columns: stonewater_data[f"Cost of {measure}"] = None @@ -1297,8 +1311,41 @@ def main(): ]: stonewater_data[c] = stonewater_data[c].astype(str) + # FIll the primary energy numbers from the excel + stonewater_data = stonewater_data.merge( + retrofit_packages_board[ + [ + "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)" + ] + ], + on=["Address ID", "Name"], + how="left" + ) + stonewater_data["Primary Energy Use (kWh/yr)"] = np.where( + pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]), + stonewater_data["BASE Primary energy (13a-272)"], + stonewater_data["Primary Energy Use (kWh/yr)"] + ) + stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"]) + + # Add on organisation reference + original_archetypes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] + original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] + original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + + stonewater_data = stonewater_data.merge( + original_archetypes[["Address ID", 'Org. ref.']], + on="Address ID", + how="left" + ) + # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False) cost_sheet = [ { @@ -1677,6 +1724,12 @@ def propsed_wave_3_sample(): asset_list = asset_list[asset_list["Address ID"] != "Address ID"] asset_list["Address ID"] = asset_list["Address ID"].astype(int) + asset_list["Street name"] = np.where( + pd.isnull(asset_list["Street name"]), + asset_list["Postcode"], + asset_list["Street name"] + ) + # Create the postal region, taking the first part of the postcode asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] @@ -1684,43 +1737,16 @@ def propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region", + ["UPRN", "Address ID", 'Org. ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region", "Property Type", "Wall Type", "Roof Type", "Heating"] ] - # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) survey_results = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), header=13, sheet_name="Modelled Packages" ) - additional_survey_data = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), - header=0 - ) - - survey_results = survey_results.drop( - columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] - ).merge( - additional_survey_data[ - [ - "Address ID", - "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", - "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness", - "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" - ] - ].rename( - columns={ - "Main Wall Insulation_x": "Main Wall Insulation Type", - } - ), - how="left", - on="Address ID" - ) - - # TOOD: We probably want the actual surveyed wall, roof, heating type survey_results = survey_results[ [ "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", @@ -1768,6 +1794,105 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # Against properties that have NO package ref, we assign a package ref + properties_with_packages = survey_results_with_original_features[ + ~pd.isnull(survey_results_with_original_features["Package Ref"]) + ] + + properties_without_packages = survey_results_with_original_features[ + (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull( + survey_results_with_original_features["Package Ref"] + ) + ] + + # Change this to a lookup + package_ratings = pd.DataFrame([ + { + "1A": 1, + "1B": 2, + "2A": 3, + "2B": 4, + "3A": 5, + "3B": 6, + 4: 7 + } + ]) + package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank") + + mapped_package_refs = [] + for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)): + # Same archetype? + matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]] + + if matches.empty: + # Similar property + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"] == property["Wall Type"]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + raise Exception("Implement me") + if matches.shape[0] > 1: + # Take the package with the highest rank + matches = matches.merge( + package_ratings, + on="Package Ref", + how="left" + ).sort_values("Rank", ascending=False).head(1) + + mapped_package_refs.append( + { + "Address ID": property["Address ID"], + "Matched Package Ref": matches["Package Ref"].values[0] + } + ) + + mapped_package_refs = pd.DataFrame(mapped_package_refs) + + survey_results = survey_results.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results["Package Ref"] = np.where( + pd.notnull(survey_results["Matched Package Ref"]), + survey_results["Matched Package Ref"], + survey_results["Package Ref"] + ) + survey_results = survey_results.drop(columns=["Matched Package Ref"]) + + # Do the same with survey_results_with_original_features + survey_results_with_original_features = survey_results_with_original_features.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results_with_original_features["Package Ref"] = np.where( + pd.notnull(survey_results_with_original_features["Matched Package Ref"]), + survey_results_with_original_features["Matched Package Ref"], + survey_results_with_original_features["Package Ref"] + ) + survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"]) + + # Save the data for reference + # mapped_package_refs = mapped_package_refs.merge( + # asset_list[["Name", "Postcode", "Address ID", "Org. ref."]], + # on="Address ID", + # how="left" + # ) + # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False) + # We get longitude & Latitude archetyping_spatial_features = read_pickle_from_s3( bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", @@ -1911,7 +2036,8 @@ def propsed_wave_3_sample(): 'Current EPC Band', 'Current SAP Rating', 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', - 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + 'Survey: Matching Address ID', 'Distance to Closest Match (m)', + "Package Ref" ]: region_assets[col] = np.where( pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), @@ -2027,7 +2153,7 @@ def propsed_wave_3_sample(): "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', - "Match Type" + "Match Type", "Package Ref" ] ) @@ -2183,6 +2309,13 @@ def propsed_wave_3_sample(): closest_match = surveyed.iloc[0] + # The closest property may be an EPC C, we we take the package ref from the property that's the nearest + # with non-NA package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = surveyed["Package Ref"].dropna().values[0] + final_missed_matches.append( { "Address ID": a_id, @@ -2195,7 +2328,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": closest_match["Package Ref"] + "Package Ref": package_ref } ) continue @@ -2225,6 +2358,11 @@ def propsed_wave_3_sample(): results = pd.concat(results) + results[ + pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D") + ]["Postal Region"] + results[resul] + # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( [ @@ -2269,8 +2407,6 @@ def propsed_wave_3_sample(): street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - print(street_summary.sum()) - selected_rows, _ = optimise( gain=street_summary["Gain"].values, loss=street_summary["Loss"].values, @@ -2334,9 +2470,6 @@ def propsed_wave_3_sample(): package_summary, how="left", on="Street and Region" ) street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False - ) individual_units_programme = results.copy() individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( @@ -2386,6 +2519,79 @@ def propsed_wave_3_sample(): .str.strip() # Strip leading/trailing spaces ) + # Any EPC C properties that have been included should be flagged as potential low carbon heating + selected_epc_c = individual_units_programme[ + (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) & + (individual_units_programme["Unit in Programme"]) + ] + + flat_wall_map = { + "CA Cavity: F Filled Cavity": False, + "CA Cavity: A As Built": True, + "SO Solid Brick: A As Built": True, + "Not Surveyed": False + } + + heating_map = { + "BGW Post 98 Combi condens. with auto ign.": False, + "BGB Post 98 Regular condens. with auto ign.": False, + "SEK High heat retention storage heaters": False, + "SEB Modern slimline storage heaters": True, + "Not Surveyed": False + } + + infill_data = [] + for _, epc_c_property in selected_epc_c.iterrows(): + if epc_c_property["Property Type"].split(":")[0] == "Flat": + # Look for a wall insulation measure + infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Possible Flat Infill?": infill + } + ) + continue + + infill = heating_map[epc_c_property["Survey: Primary Heating System"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Low Carbon Heating Infill?": infill + } + ) + infill_data = pd.DataFrame(infill_data) + + individual_units_programme = individual_units_programme.merge( + infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']], + how="left", on="Address ID" + ) + + for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']: + individual_units_programme[c] = individual_units_programme[c].fillna(False) + + infill_by_street = infill_data.pivot_table( + index='Street and Region', + values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'], + aggfunc='sum', + fill_value=0 + ).reset_index() + + street_bid_structure = street_bid_structure.merge( + infill_by_street, how="left", on="Street and Region" + ) + + for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: + street_bid_structure[c] = street_bid_structure[c].fillna(0) + + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + # TODO: Add the full Address!!! + individual_units_programme.to_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False )