From dc1cf6d6045c5f94e2826f6ff20010e05043d1ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 15:49:08 +0000 Subject: [PATCH] working on stonewater matching algorithm --- .../southend/epc_data_pull_2024_11_14.py | 4 - .../stonewater/Wave 3 Preparation.py | 133 +++++++++++++++++- etl/route_march_data_pull/app.py | 43 +++++- 3 files changed, 171 insertions(+), 9 deletions(-) diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py index 14cd73be..11ddcc6f 100644 --- a/etl/customers/southend/epc_data_pull_2024_11_14.py +++ b/etl/customers/southend/epc_data_pull_2024_11_14.py @@ -229,7 +229,3 @@ def app(): filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov " "2024.xlsx") asset_list.to_excel(filename, index=False) - - asset_list["% of the Roof with PV"].value_counts() - - asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index a5bbff7b..019c51c9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -117,7 +117,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ - + data = { "Address": None, "Postcode": None, @@ -1618,5 +1618,136 @@ def append_stonewater_id(): index=False ) + +def propsed_wave_3_sample(): + """ + Stonewater want to ensure that, when selecting properties for wave 3, they choose properties + such that most of the properties within a geographical area are treatable within the bid. 
+ Namely, if we take a geographical area (which could be postal region) they want most, and ideally all, of the + properties within that geographical area to be included within the bid + :return: + """ + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + # Clean address ids + asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] + asset_list = asset_list[asset_list["Address ID"] != "Address ID"] + asset_list["Address ID"] = asset_list["Address ID"].astype(int) + + # Create the postal region, taking the first part of the postcode + asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + unique_postal_regions = asset_list["Postal Region"].unique() + + # Keep just the columns we need + asset_list = asset_list[ + ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + "Heating"] + ] + + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + # TODO: We probably want the actual surveyed wall, roof, heating type + survey_results = survey_results[ + ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"] + ] + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + + survey_results_with_original_features = survey_results.merge( + asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + if survey_results_with_original_features.shape[0] != survey_results.shape[0]: + raise ValueError("Something went wrong") + + # Tier definitions + # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is EPC D or below + # Tier 2: We have a property in the same archetype that was surveyed and is EPC D or below 
+ # + + for region in unique_postal_regions: + # Take all of the properties in that region + region_assets = asset_list[asset_list["Postal Region"] == region].copy() + archetypes = region_assets["Archetype ID"].unique() + # We get the properties that have been surveyed + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me") + + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left" + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = None + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), + "1", region_assets["Confidence Tier"] + ) + # TODO: Turn into a function + missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me 2") + + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) + + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]), + "2 - same archetype", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( + region_assets["Current EPC Band_method2"]) + + region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + for 
a_id in missed_addressids: + property = asset_list[asset_list["Address ID"] == a_id].squeeze() + + surveyed_same_postcode = survey_results_with_original_features[ + (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + surveyed_same_region = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + same_postcode = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + pd.isnull(region_assets["Current EPC Band"]).sum() + # if __name__ == "__main__": # main() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 060897f8..f24c5bb2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -206,6 +206,14 @@ def app(): # Drop the column that is "" transformed_df = transformed_df.drop(columns=[""]) + # Get the find my epc data + find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + pd.json_normalize(epc_df["find_my_epc_data"]) + ) + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + # Retrieve just the data we need epc_df = epc_df[ [ @@ -228,6 +236,7 @@ def app(): "mainheat-description", # "energy-consumption-current", # kwh/m2 + "photo-supply", ] ] @@ -236,12 +245,25 @@ def app(): how="left", on="row_id" ).merge( - transformed_df, + find_my_epc_data[ + [ + "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", + "Assessor’s ID", "Solar photovoltaics" + ] + ].rename( + 
columns={ + "Solar photovoltaics": "Has Solar PV", + "heating_text": "Heating Estimated kWh", + "hot_water_text": "Hot Water Estimated kWh", + } + ), how="left", on="row_id" ) - asset_list = asset_list.drop(columns=["row_id"]) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) + asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns asset_list = asset_list.rename(columns={ @@ -259,7 +281,7 @@ def app(): "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)" + "energy-consumption-current": "Heat Demand (kWh/m2)", }) asset_list["Estimated Number of Floors"] = asset_list.apply( @@ -295,6 +317,19 @@ def app(): axis=1 ) + # For all of the columns in transformed_df, prefix with "Recommendation: " + for col in transformed_df.columns: + if col == "row_id": + continue + transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) + + asset_list = asset_list.merge( + transformed_df, + how="left", + on="row_id" + ) + asset_list = asset_list.drop(columns=["row_id"]) + # Store as an excel - filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" asset_list.to_excel(filename, index=False)