From 0796b384fb3aa8bf6cb3689c21cd5c5ac5acfc87 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 18:42:08 +0000 Subject: [PATCH 01/31] added non-invasive rec --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 27 +++++++++++++++++-------- recommendations/FloorRecommendations.py | 11 +++++++++- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index a0d01f7d..33015d87 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -1,7 +1,7 @@ import pandas as pd from utils.s3 import save_csv_to_s3 -PORTFOLIO_ID = 111 +PORTFOLIO_ID = 120 USER_ID = 8 @@ -13,9 +13,9 @@ def app(): asset_list = [ { - "uprn": 100050770761, - "address": "12 Sheardown Street", - "postcode": "DN4 0BH" + "uprn": 100030334057, + "address": "5, Lynton Street", + "postcode": "DE22 3RW" } ] asset_list = pd.DataFrame(asset_list) @@ -30,11 +30,22 @@ def app(): non_invasive_recommendations = [ { - "uprn": 100050770761, + "uprn": 100030334057, "recommendations": [ { - "type": "extension_cavity_wall_insulation", + "type": "internal_wall_insulation", + "sap_points": 9, + "survey": True + }, + { + "type": "external_wall_insulation", + "sap_points": 9, + "survey": True + }, + { + "type": "suspended_floor_insulation", "sap_points": 2, + "survey": True } ] } @@ -49,8 +60,8 @@ def app(): valuation_data = [ { - "uprn": 100050770761, - "value": 67_000 + "uprn": 100030334057, + "value": 133_000 } ] # Store valuation data to s3 diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index 25741e7a..ed00bbe9 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -172,6 +172,11 @@ class FloorRecommendations(Definitions): insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recs = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None for _, insulation_material_group in insulation_materials.groupby("description"): @@ -217,6 +222,9 @@ class FloorRecommendations(Definitions): else: raise NotImplementedError("Implement me!") + sap_points = non_invasive_recs.get("sap_points", None) + survey = non_invasive_recs.get("survey", False) + floor_ending_config = FloorAttributes(new_description).process() floor_simulation_config = check_simulation_difference( new_config=floor_ending_config, old_config=self.property.floor, prefix="floor_" @@ -245,7 +253,8 @@ class FloorRecommendations(Definitions): "description": self._make_floor_description(material), "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": sap_points, + "survey": survey, "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { From 2b22a6012fc11b9e94cd430d0b4ae8426293ef9e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 21:17:37 +0000 Subject: [PATCH 02/31] remote assessment complete --- recommendations/HotwaterRecommendations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index aed1a5e5..b86329e4 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -66,7 +66,7 @@ class HotwaterRecommendations: (self.property.hotwater["heater_type"] in ["electric immersion"]) & (self.property.data["hot-water-energy-eff"] == "Very Poor") & (self.property.hotwater["no_system_present"] is None) & - len(has_tank_recommendation) == 0 + (len(has_tank_recommendation) == 0) ): self.recommend_tank_insulation(phase=phase) return From 31c5935577d6723360841f3ddb2803f82a6b6123 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 21:58:51 +0000 Subject: [PATCH 03/31] creating route march planning app --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/find_my_epc/RetrieveFindMyEpc.py | 25 +- etl/route_march_data_pull/app.py | 300 +++++++++++++++++++++ etl/route_march_data_pull/requirements.txt | 0 5 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 etl/route_march_data_pull/app.py create mode 100644 etl/route_march_data_pull/requirements.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index cd76dae4..913a04b8 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -26,6 +26,20 @@ class RetrieveFindMyEpc: self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + @staticmethod + def extract_low_carbon_sources(soup): + # Find the section header + section_header = soup.find("h3", string="Low and zero carbon energy sources") + if not section_header: + return {} + + # Locate the list following the header + energy_list = section_header.find_next("ul") + + # Extract the list items + sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")} + return sources + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): """ For a post code and address, we pull out all the required data from the find my epc website @@ -191,6 +205,9 @@ class RetrieveFindMyEpc: # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) + # 4) Low and zero carbon energy sources + low_carbon_energy_sources = self.extract_low_carbon_sources(address_res) + resulting_data = { 'epc_certificate': epc_certificate, 'current_epc_rating': current_rating.split(' ')[-6], @@ -200,7 +217,8 @@ class RetrieveFindMyEpc: "heating_text": heating_text, "hot_water_text": hot_water_text, "recommendations": recommendations, - **assessment_data + **assessment_data, + **low_carbon_energy_sources } return resulting_data @@ -246,6 +264,11 @@ class RetrieveFindMyEpc: ], "Band A condensing boiler": ["boiler_upgrade"], "Double glazing": ["double_glazing"], + "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"], + "Wind turbine": ["wind_turbine"], + "Loft insulation": ["loft_insulation"], + "Solar photovoltaic (PV) panels": ["solar_pv"], + "Party wall insulation": ["party_wall_insulation"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py new file mode 100644 index 00000000..060897f8 --- /dev/null +++ b/etl/route_march_data_pull/app.py @@ -0,0 +1,300 @@ +import os +import time + +import pandas as pd +import numpy as np +from tqdm import tqdm + +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list, fulladdress_column, address1_column, postcode_column): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + try: + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + +def extract_address1(asset_list, full_address_col, method="first_two_words"): + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + raise ValueError(f"Method {method} not recognized") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" + DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" + + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list["row_id"] = asset_list.index + + # We clean up portential non-breaking spaces, and double spaces + for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: + asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) + asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) + + if ADDRESS1_COLUMN is None: + ADDRESS1_COLUMN = "address1_extracted" + asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + + epc_data, errors = get_data( + asset_list=asset_list, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data( + asset_list=asset_list_failed, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt new file mode 100644 index 00000000..e69de29b From dc1cf6d6045c5f94e2826f6ff20010e05043d1ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 15:49:08 +0000 Subject: [PATCH 04/31] working on stonewater matching algorithm --- .../southend/epc_data_pull_2024_11_14.py | 4 - .../stonewater/Wave 3 Preparation.py | 133 +++++++++++++++++- etl/route_march_data_pull/app.py | 43 +++++- 3 files changed, 171 insertions(+), 9 deletions(-) diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py index 14cd73be..11ddcc6f 100644 --- a/etl/customers/southend/epc_data_pull_2024_11_14.py +++ b/etl/customers/southend/epc_data_pull_2024_11_14.py @@ -229,7 +229,3 @@ def app(): filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov " "2024.xlsx") asset_list.to_excel(filename, index=False) - - asset_list["% of the Roof with PV"].value_counts() - - asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index a5bbff7b..019c51c9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -117,7 +117,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ - + data = { "Address": None, "Postcode": None, @@ -1618,5 +1618,136 @@ def append_stonewater_id(): index=False ) + +def propsed_wave_3_sample(): + """ + Stonewater want to ensure that the properties that when selecting properties for wave 3, they choose properties + such that most of the properties within a geographical area are treatable within the bid. + Name, if we take a geographical area (which could be postal region) they want the most, and ideally all, of the + properties within that geographical area to be included within the bid + :return: + """ + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + # Clean address ids + asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] + asset_list = asset_list[asset_list["Address ID"] != "Address ID"] + asset_list["Address ID"] = asset_list["Address ID"].astype(int) + + # Create the postal region, taking the first part of the postcode + asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + unique_postal_regions = asset_list["Postal Region"].unique() + + # Keep just the columns we need + asset_list = asset_list[ + ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + "Heating"] + ] + + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + # TOOD: We probably want the actual surveyed wall, roof, heating type + survey_results = survey_results[ + ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"] + ] + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + + survey_results_with_original_features = survey_results.merge( + asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + if survey_results_with_original_features.shape[0] != survey_results.shape[0]: + raise ValueError("Something went wrong") + + # Tier definitions + # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D + # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D + # + + for region in unique_postal_regions: + # Take all of the properties in that region + region_assets = asset_list[asset_list["Postal Region"] == region].copy() + archetypes = region_assets["Archetype ID"].unique() + # We get the properties that have been surveyed + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me") + + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left" + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = None + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), + "1", region_assets["Confidence Tier"] + ) + # TODO: Turn into a function + missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me 2") + + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) + + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]), + "2 - same archetype", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( + region_assets["Current EPC Band_method2"]) + + region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + for a_id in missed_addressids: + property = asset_list[asset_list["Address ID"] == a_id].squeeze() + + surveyed_same_postcode = survey_results_with_original_features[ + (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + surveyed_same_region = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + same_postcode = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + pd.isnull(region_assets["Current EPC Band"]).sum() + # if __name__ == "__main__": # main() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 060897f8..f24c5bb2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -206,6 +206,14 @@ def app(): # Drop the column that is "" transformed_df = transformed_df.drop(columns=[""]) + # Get the find my epc data + find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + pd.json_normalize(epc_df["find_my_epc_data"]) + ) + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + # Retrieve just the data we need epc_df = epc_df[ [ @@ -228,6 +236,7 @@ def app(): "mainheat-description", # "energy-consumption-current", # kwh/m2 + "photo-supply", ] ] @@ -236,12 +245,25 @@ def app(): how="left", on="row_id" ).merge( - transformed_df, + find_my_epc_data[ + [ + "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", + "Assessor’s ID", "Solar photovoltaics" + ] + ].rename( + columns={ + "Solar photovoltaics": "Has Solar PV", + "heating_text": "Heating Estimated kWh", + "hot_water_text": "Hot Water Estimated kWh", + } + ), how="left", on="row_id" ) - asset_list = asset_list.drop(columns=["row_id"]) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) + asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns asset_list = asset_list.rename(columns={ @@ -259,7 +281,7 @@ def app(): "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)" + "energy-consumption-current": "Heat Demand (kWh/m2)", }) asset_list["Estimated Number of Floors"] = asset_list.apply( @@ -295,6 +317,19 @@ def app(): axis=1 ) + # For all of the columns in transformed_df, prefix with "Recommendation: " + for col in transformed_df.columns: + if col == "row_id": + continue + transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) + + asset_list = asset_list.merge( + transformed_df, + how="left", + on="row_id" + ) + asset_list = asset_list.drop(columns=["row_id"]) + # Store as an excel - filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" asset_list.to_excel(filename, index=False) From c13c84b98cbab169300306adeba534145496251c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 15:55:19 +0000 Subject: [PATCH 05/31] First region implemented --- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++++++---- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 019c51c9..7c104f97 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1729,25 +1729,61 @@ def propsed_wave_3_sample(): missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + final_missed_matches = [] for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - surveyed_same_postcode = survey_results_with_original_features[ + # TODO: This is quite strict for the moment - we might want to relax this by creating reduced versions + # of the wall, roof and heating features, splitting them on the colons and taking the first part + surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) + (survey_results_with_original_features["Property Type"] == property["Property Type"]) & + (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & + (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & + (survey_results_with_original_features["Heating"] == property["Heating"]) ] + if surveyed_similar.empty: + surveyed_similar = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) & + (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & + (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & + (survey_results_with_original_features["Heating"] == property["Heating"]) + ] - surveyed_same_region = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) - ] + if surveyed_similar.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "5 - no similar property, needs survey to confirm" + } + ) + continue - same_postcode = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + raise NotImplementedError("Implement me") - pd.isnull(region_assets["Current EPC Band"]).sum() + final_missed_matches = pd.DataFrame(final_missed_matches) + + region_assets = region_assets.merge( + final_missed_matches, + on="Address ID", + how="left", + suffixes=("", "_method3") + ) + + region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( + region_assets["Confidence Tier_method3"] + ) + + region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) + + region_assets["Current EPC Band"] = np.where( + region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm", + "Unknown", region_assets["Current EPC Band"] + ) + + if pd.isnull(region_assets["Current EPC Band"]).sum(): + raise Exception("Something went wrong") # if __name__ == "__main__": # main() From 8f9b8f08862cbadcbd0daaa29219cd0980606b3f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 16:30:23 +0000 Subject: [PATCH 06/31] working on algorithm --- etl/customers/stonewater/Wave 3 Preparation.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7c104f97..008fd3bc 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1633,6 +1633,9 @@ def propsed_wave_3_sample(): "- Archetyped V3.1.xlsx", header=4 ) + + # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater + asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1674,6 +1677,7 @@ def propsed_wave_3_sample(): # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D # + results = [] for region in unique_postal_regions: # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() @@ -1722,10 +1726,17 @@ def propsed_wave_3_sample(): ) region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( - region_assets["Current EPC Band_method2"]) + region_assets["Current EPC Band_method2"].astype(str), + ) region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + # We label EPC C properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "6 - EPC C or above", region_assets["Confidence Tier"] + ) + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() # This means that this archetype was never surveyed and so we need to find a sufficiently similar property @@ -1785,5 +1796,7 @@ def propsed_wave_3_sample(): if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") + results.append(region_assets) + # if __name__ == "__main__": # main() From 2158ab2cd50df7edcfc7e119b56237145f4f1dd1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 16:33:43 +0000 Subject: [PATCH 07/31] debugging stoneater alg --- etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 008fd3bc..ef7dd414 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1635,7 +1635,7 @@ def propsed_wave_3_sample(): ) # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater - asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"] + asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1678,7 +1678,7 @@ def propsed_wave_3_sample(): # results = [] - for region in unique_postal_regions: + for region in tqdm(unique_postal_regions): # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() archetypes = region_assets["Archetype ID"].unique() @@ -1739,7 +1739,11 @@ def propsed_wave_3_sample(): missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() - # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + if not missed_addressids: + results.append(region_assets) + continue + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property final_missed_matches = [] for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() From 4d021f0ba6a5894659275d8090e1f65be6ca68f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 17:12:55 +0000 Subject: [PATCH 08/31] working on stonewater alg --- .../stonewater/Wave 3 Preparation.py | 102 +++++++++++++++--- 1 file changed, 86 insertions(+), 16 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ef7dd414..40dfd38e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,6 +3,7 @@ import PyPDF2 import re import pandas as pd import numpy as np +from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter @@ -1681,19 +1682,15 @@ def propsed_wave_3_sample(): for region in tqdm(unique_postal_regions): # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() - archetypes = region_assets["Archetype ID"].unique() - # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me") + # We have a tier 1 match if the property itself was surveyed + exact_surveyed = survey_results[ + survey_results["Address ID"].isin(region_assets["Address ID"]) + ] region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", + exact_surveyed[["Address ID", "Current EPC Band"]], + on="Address ID", how="left" ) @@ -1701,22 +1698,95 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = None region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), - "1", region_assets["Confidence Tier"] + "1 - property was surveyed", region_assets["Confidence Tier"] ) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "6 - property was surveyed", region_assets["Confidence Tier"] + ) + + archetypes = region_assets[ + pd.isnull(region_assets["Confidence Tier"]) + ]["Archetype ID"].unique() + # We get the properties that have been surveyed region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me 2") + # Take the duplicated archetypes + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].isin(duplicated_archetypes) + ] + + # We need to select which one is the most relevant to these properties + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) + ] + + raise NotImplementedError("Fix me") region_assets = region_assets.merge( region_surveyed, on="Archetype ID", how="left", + suffixes=("", "_method1") + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), + "1 - Archetype surveyed", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) + # TODO: Turn into a function + missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if archetype_surveyed["Archetype ID"].duplicated().sum(): + # We need to select which one is the most relevant to these properties + duplicated_archetypes = archetype_surveyed[ + archetype_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) + ] + + homes_with_these_archetypes = region_assets[ + region_assets["Archetype ID"].isin(duplicated_archetypes) + ] + + for _, home in homes_with_these_archetypes.iterrows(): + first_filter = survey_data[ + (survey_data["Postal Region"] == home["Postal Region"]) & + (survey_data["Property Type"] == home["Property Type"]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + if not first_filter.empty: + NotImplementedError("Fix me 0") + + second_filter = survey_data[ + (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + raise NotImplementedError("Fix me 2") + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", suffixes=("", "_method2") ) From d00c291c17dacb545eef4b708047ec5c699baf18 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 15:16:54 +0000 Subject: [PATCH 09/31] debugging stonewater algorithm --- .../stonewater/Wave 3 Preparation.py | 68 +++++++------------ 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 40dfd38e..5b1e2f91 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1716,20 +1716,11 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - # Take the duplicated archetypes - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].isin(duplicated_archetypes) - ] - - # We need to select which one is the most relevant to these properties - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) - ] - - raise NotImplementedError("Fix me") + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) region_assets = region_assets.merge( region_surveyed, @@ -1744,6 +1735,17 @@ def propsed_wave_3_sample(): pd.isnull(region_assets["Confidence Tier"]), "1 - Archetype surveyed", region_assets["Confidence Tier"] ) + + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), + region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"] + ) + # Handle EPC C + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + "6 - EPC C or above", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) # TODO: Turn into a function missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) @@ -1752,36 +1754,16 @@ def propsed_wave_3_sample(): survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + # TODO - We could average the property?? And call it borderline, call out it was averaged!!! + # We could also find the nearest property to it, with similar wall, roof, heating? + # Can use long/lag to distance calc. We have this data from previous + if archetype_surveyed["Archetype ID"].duplicated().sum(): - # We need to select which one is the most relevant to these properties - duplicated_archetypes = archetype_surveyed[ - archetype_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) - ] - - homes_with_these_archetypes = region_assets[ - region_assets["Archetype ID"].isin(duplicated_archetypes) - ] - - for _, home in homes_with_these_archetypes.iterrows(): - first_filter = survey_data[ - (survey_data["Postal Region"] == home["Postal Region"]) & - (survey_data["Property Type"] == home["Property Type"]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - if not first_filter.empty: - NotImplementedError("Fix me 0") - - second_filter = survey_data[ - (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - raise NotImplementedError("Fix me 2") + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) + archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) region_assets = region_assets.merge( archetype_surveyed, From 05cf7514783786261f7efe70eda5486712f8fb4c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 16:00:59 +0000 Subject: [PATCH 10/31] debuggin --- .../stonewater/Wave 3 Preparation.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5b1e2f91..d2110de8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1777,8 +1777,9 @@ def propsed_wave_3_sample(): "2 - same archetype", region_assets["Confidence Tier"] ) - region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( - region_assets["Current EPC Band_method2"].astype(str), + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]), + region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"] ) region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) @@ -1822,12 +1823,26 @@ def propsed_wave_3_sample(): final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": "5 - no similar property, needs survey to confirm" + "Confidence Tier": "5 - no similar property, needs survey to confirm", + "Current EPC Band": "Unknown" } ) continue + # We take an average + expected_sap = surveyed_similar["Current SAP Rating"].mean() + expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: + tier = "6 - EPC C or above" + else: + tier = "3 - similar property" - raise NotImplementedError("Implement me") + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": tier, + "Current EPC Band": "Unknown" + } + ) final_missed_matches = pd.DataFrame(final_missed_matches) @@ -1841,14 +1856,13 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( region_assets["Confidence Tier_method3"] ) + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]), + region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] + ) region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) - region_assets["Current EPC Band"] = np.where( - region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm", - "Unknown", region_assets["Current EPC Band"] - ) - if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") From 7d209d5d8e07b4112bffcdcfc748d04cc299abe6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 16:28:43 +0000 Subject: [PATCH 11/31] creating loss and gain columns --- .../stonewater/Wave 3 Preparation.py | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d2110de8..b36ae756 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1703,7 +1703,7 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["C", "B", "A"]), - "6 - property was surveyed", region_assets["Confidence Tier"] + "5 - property was surveyed", region_assets["Confidence Tier"] ) archetypes = region_assets[ @@ -1721,6 +1721,7 @@ def propsed_wave_3_sample(): (survey_results["Postal Region"] == region) ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) + region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"]) region_assets = region_assets.merge( region_surveyed, @@ -1743,7 +1744,7 @@ def propsed_wave_3_sample(): # Handle EPC C region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - "6 - EPC C or above", region_assets["Confidence Tier"] + "5 - EPC C or above", region_assets["Confidence Tier"] ) region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) @@ -1773,7 +1774,8 @@ def propsed_wave_3_sample(): ) region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]), + region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( + region_assets["Confidence Tier"]), "2 - same archetype", region_assets["Confidence Tier"] ) @@ -1786,8 +1788,8 @@ def propsed_wave_3_sample(): # We label EPC C properties region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]), - "6 - EPC C or above", region_assets["Confidence Tier"] + region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + "5 - EPC C or above", region_assets["Confidence Tier"] ) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() @@ -1823,7 +1825,7 @@ def propsed_wave_3_sample(): final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": "5 - no similar property, needs survey to confirm", + "Confidence Tier": "4 - no similar property, needs survey to confirm", "Current EPC Band": "Unknown" } ) @@ -1832,7 +1834,7 @@ def propsed_wave_3_sample(): expected_sap = surveyed_similar["Current SAP Rating"].mean() expected_epc = sap_to_epc(expected_sap) if expected_epc in ["C", "B", "A"]: - tier = "6 - EPC C or above" + tier = "5 - EPC C or above" else: tier = "3 - similar property" @@ -1861,12 +1863,42 @@ def propsed_wave_3_sample(): region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] ) - region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) + region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"]) if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") results.append(region_assets) + results = pd.concat(results) + + # Create a pivot table for counts of Confidence Tier by Postal Region + geographic_summary = results.pivot_table( + index='Postal Region', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + + # We create the gain and loss columns + # Gain is the sum of these columns: + # '1 - Archetype surveyed', '1 - property was surveyed', + # '2 - same archetype', '3 - similar property', + # Loss is the sum of these columns: + # '4 - no similar property, needs survey to confirm', + # '5 - EPC C or above', '5 - property was surveyed' + geographic_summary["Gain"] = geographic_summary[ + ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property'] + ].sum(axis=1) + + geographic_summary["Loss"] = geographic_summary[ + ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed'] + ].sum(axis=1) + + geographic_summary.sum() + + geographic_summary = geographic_summary.sort_values("Loss", ascending=True) + geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() + # if __name__ == "__main__": # main() From a01ff1d8dedaaf78e8ce95b21305a6f1a430ae3e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 16:45:10 +0000 Subject: [PATCH 12/31] tweaking postal region algorithm - may need to swap to postcode or street --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b36ae756..20f771ec 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1803,22 +1803,43 @@ def propsed_wave_3_sample(): for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - # TODO: This is quite strict for the moment - we might want to relax this by creating reduced versions - # of the wall, roof and heating features, splitting them on the colons and taking the first part + if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: + filter_property_types = ["House", "Bungalow"] + else: + filter_property_types = ["Flat"] + surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) & - (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & - (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & - (survey_results_with_original_features["Heating"] == property["Heating"]) + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) ] if surveyed_similar.empty: surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) & - (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & - (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & - (survey_results_with_original_features["Heating"] == property["Heating"]) + (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + )) & + (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0]) & + (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0]) & + (survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0]) ] if surveyed_similar.empty: @@ -1842,7 +1863,7 @@ def propsed_wave_3_sample(): { "Address ID": a_id, "Confidence Tier": tier, - "Current EPC Band": "Unknown" + "Current EPC Band": expected_epc } ) @@ -1899,6 +1920,7 @@ def propsed_wave_3_sample(): geographic_summary = geographic_summary.sort_values("Loss", ascending=True) geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() + geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() # if __name__ == "__main__": # main() From 7d63c164045c6855ea6cb13091788a2ed7db2afb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 18:05:05 +0000 Subject: [PATCH 13/31] implemented linear programming to find maximal bid size --- .../stonewater/Wave 3 Preparation.py | 71 ++++++++++++++++--- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 20f771ec..c397f962 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,9 +3,9 @@ import PyPDF2 import re import pandas as pd import numpy as np -from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter +from scipy.optimize import linprog CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") @@ -1843,13 +1843,38 @@ def propsed_wave_3_sample(): ] if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Unknown" - } - ) + + # We get an average based on the postcode + surveyed_similar = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + )) + ] + if surveyed_similar.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Unknown" + } + + ) + else: + expected_sap = surveyed_similar["Current SAP Rating"].mean() + expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: + tier = "5 - EPC C or above" + else: + tier = "3 - similar property, relaxed conditions" + + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": tier, + "Current EPC Band": expected_epc + } + ) continue # We take an average expected_sap = surveyed_similar["Current SAP Rating"].mean() @@ -1922,5 +1947,35 @@ def propsed_wave_3_sample(): geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() + geographic_summary[["Loss", "Gain"]].head() + + loss = geographic_summary["Loss"].values + gain = geographic_summary["Gain"].values + + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain + + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [250] # Maximum total Loss allowed + + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] + + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + print(optimal_gain) + + # Select the rows that are selected + geographic_summary["Selected"] = selected_rows == 1 + geographic_summary[geographic_summary["Selected"]].sum() + bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + print("Bid Size:", bid_size) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 3ad5d2c1..09ba20bd 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -7,4 +7,5 @@ epc-api-python==1.0.2 usaddress==0.5.11 fuzzywuzzy==0.18.0 python-dotenv +scipy From eff80e637f73490c3f45d2ef0ffcc71a188e95cb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 19:10:23 +0000 Subject: [PATCH 14/31] implementing distance weighting --- .../stonewater/Wave 3 Preparation.py | 332 +++++++++++++----- 1 file changed, 248 insertions(+), 84 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c397f962..3b44d560 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1635,8 +1635,9 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater - asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"] + # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # UPRN + asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1648,7 +1649,7 @@ def propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", "Heating"] ] @@ -1665,7 +1666,7 @@ def propsed_wave_3_sample(): survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] survey_results_with_original_features = survey_results.merge( - asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], on="Address ID", how="left" ) @@ -1673,6 +1674,45 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # We get longitude & Latitude + from utils.s3 import read_pickle_from_s3 + archetyping_spatial_features = read_pickle_from_s3( + bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", + ) + archetyping_spatial_features = pd.concat(archetyping_spatial_features) + archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename( + columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"} + ) + # Merge them onto both datasets + asset_list = asset_list.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(asset_list["longitude"]).sum(): + raise ValueError("Something went wrong") + + survey_results_with_original_features = survey_results_with_original_features.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(survey_results_with_original_features["longitude"]).sum(): + raise ValueError("Something went wrong") + + def haversine(lat1, lon1, lat2, lon2): + # Radius of Earth in meters + R = 6371000 + + # Convert degrees to radians + lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) + + # Differences + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Haversine formula + a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2 + c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) + distance = R * c + return distance + # Tier definitions # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D @@ -1716,6 +1756,7 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): + blah1 region_surveyed = survey_results[ survey_results["Archetype ID"].isin(archetypes) & (survey_results["Postal Region"] == region) @@ -1755,23 +1796,46 @@ def propsed_wave_3_sample(): survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - # TODO - We could average the property?? And call it borderline, call out it was averaged!!! - # We could also find the nearest property to it, with similar wall, roof, heating? - # Can use long/lag to distance calc. We have this data from previous - if archetype_surveyed["Archetype ID"].duplicated().sum(): - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) - archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + archetype_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) + archetype_surveyed = pd.DataFrame(archetype_surveyed) + region_assets = region_assets.merge( + archetype_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method2") + ) + else: + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( @@ -1792,6 +1856,16 @@ def propsed_wave_3_sample(): "5 - EPC C or above", region_assets["Confidence Tier"] ) + region_assets["Confidence Tier"] = np.where( + region_assets["Archetype ID"] == "EPC C OR ABOVE", + "5 - EPC C or above", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = np.where( + region_assets["Archetype ID"] == "EPC C OR ABOVE", + "C", region_assets["Current EPC Band"] + ) + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() if not missed_addressids: @@ -1803,17 +1877,10 @@ def propsed_wave_3_sample(): for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - filter_property_types = ["House", "Bungalow"] - else: - filter_property_types = ["Flat"] - - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - ) + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] ) & ( survey_results_with_original_features["Wall Type"].str.split(":").str[0] == @@ -1827,62 +1894,38 @@ def propsed_wave_3_sample(): survey_results_with_original_features["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0] ) - ] - if surveyed_similar.empty: - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) & - (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0]) & - (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0]) & - (survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0]) - ] + ].copy() - if surveyed_similar.empty: + if surveyed.empty: + blah3 - # We get an average based on the postcode - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) - ] - if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Unknown" - } + # Calculate distance + surveyed["distance_meters"] = haversine( + lat1=property["latitude"], lon1=property["longitude"], + lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values + ) + surveyed = surveyed.sort_values("distance_meters", ascending=True) - ) - else: - expected_sap = surveyed_similar["Current SAP Rating"].mean() - expected_epc = sap_to_epc(expected_sap) - if expected_epc in ["C", "B", "A"]: - tier = "5 - EPC C or above" - else: - tier = "3 - similar property, relaxed conditions" + # Check if we have a postcode match check if surveyed postcode is the same as the property postcode + if any(surveyed["Postcode"] == property["Postcode"]): + surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": tier, - "Current EPC Band": expected_epc - } - ) - continue - # We take an average - expected_sap = surveyed_similar["Current SAP Rating"].mean() + if any(surveyed["Postal Region"] == property["Postal Region"]): + surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + + # Take the 5 nearest + surveyed_similar = surveyed_similar.head(5) + + # perform a weighted mean of SAP rating - the closer the better + expected_sap = np.average( + surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + ) expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: tier = "5 - EPC C or above" else: - tier = "3 - similar property" + tier = "3 - similar property, weighted on distance" final_missed_matches.append( { @@ -1891,6 +1934,121 @@ def propsed_wave_3_sample(): "Current EPC Band": expected_epc } ) + continue + + # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: + # filter_property_types = ["House", "Bungalow"] + # else: + # filter_property_types = ["Flat"] + # + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # ) + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # if surveyed_similar.empty: + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) & + # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0]) & + # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0]) & + # (survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0]) + # ] + # + # if surveyed_similar.empty: + # + # # We get an average based on the postcode + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) + # ] + # if surveyed_similar.empty: + # surveyed_similar_entire_population = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ + # "Property Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # + # # We order them by distance on postcode + # + # # Average + # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "3 - similar property, all areas searched", + # "Current EPC Band": expected_epc + # } + # + # ) + # else: + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # tier = "3 - similar property, relaxed conditions" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) + # continue + # # We take an average + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # tier = "3 - similar property" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) final_missed_matches = pd.DataFrame(final_missed_matches) @@ -1928,27 +2086,33 @@ def propsed_wave_3_sample(): # We create the gain and loss columns # Gain is the sum of these columns: - # '1 - Archetype surveyed', '1 - property was surveyed', - # '2 - same archetype', '3 - similar property', + # '1 - Archetype surveyed', + # '1 - property was surveyed', + # '2 - same archetype', + # '3 - similar property', + # '3 - similar property, all areas searched', + # '3 - similar property, relaxed conditions' + # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' geographic_summary["Gain"] = geographic_summary[ - ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property'] + [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', + '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' + ] ].sum(axis=1) geographic_summary["Loss"] = geographic_summary[ - ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed'] + ['5 - EPC C or above', '5 - property was surveyed'] ].sum(axis=1) - geographic_summary.sum() + print(geographic_summary.sum()) geographic_summary = geographic_summary.sort_values("Loss", ascending=True) geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() - geographic_summary[["Loss", "Gain"]].head() - loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values From a630fe05c485aca2c5509748eecb5544ddc78dbe Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 19:46:17 +0000 Subject: [PATCH 15/31] fixing unhandled cases in matching algorithm --- .../stonewater/Wave 3 Preparation.py | 92 ++++++++++++++++--- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 3b44d560..460aa8ee 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1756,20 +1756,44 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - blah1 - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) - region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"]) + region_surveyed = [] + for arch_id in archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed = pd.DataFrame(region_surveyed) + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + else: + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method1") + ) # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -1897,7 +1921,47 @@ def propsed_wave_3_sample(): ].copy() if surveyed.empty: - blah3 + # In this case, we do one additional check where we filter on everything the same apart from heating, + # where we do a slightly more rough match + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + + if "Electric" in property["Heating"]: + # Take other electric heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] + elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)": + # Take other community heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Community")] + elif property["Heating"] == 'Heat Pump: (from database)': + # Take other heat pumps + surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")] + elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": + # Take other properties with room heaters + surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + else: + raise Exception("Fix me") + + if surveyed.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Needs Survey" + } + ) + continue # Calculate distance surveyed["distance_meters"] = haversine( From 1b38832e27abcbebe575f4be867a41e4ae772949 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 20:13:19 +0000 Subject: [PATCH 16/31] 2044 properties added --- .../stonewater/Wave 3 Preparation.py | 148 ++++++++++++++---- 1 file changed, 117 insertions(+), 31 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 460aa8ee..6f98c9fd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1938,6 +1938,27 @@ def propsed_wave_3_sample(): ) ].copy() + if surveyed.empty: + if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]: + filter_property_types = ["House", "Bungalow", ] + else: + filter_property_types = ["Flat"] + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + if "Electric" in property["Heating"]: # Take other electric heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] @@ -1950,6 +1971,9 @@ def propsed_wave_3_sample(): elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": # Take other properties with room heaters surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + elif "Boiler" in property["Heating"]: + # Take other properties with boilers + surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")] else: raise Exception("Fix me") @@ -1972,17 +1996,29 @@ def propsed_wave_3_sample(): # Check if we have a postcode match check if surveyed postcode is the same as the property postcode if any(surveyed["Postcode"] == property["Postcode"]): - surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] + surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]] if any(surveyed["Postal Region"] == property["Postal Region"]): - surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] # Take the 5 nearest - surveyed_similar = surveyed_similar.head(5) + surveyed = surveyed.head(5) + + # # We allow a max distance of 10km + # surveyed = surveyed[surveyed["distance_meters"] < 10000] + # if surveyed.empty: + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "4 - no similar property, needs survey to confirm", + # "Current EPC Band": "Needs Survey" + # } + # ) + # continue # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( - surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) ) expected_epc = sap_to_epc(expected_sap) @@ -2153,23 +2189,21 @@ def propsed_wave_3_sample(): # '1 - Archetype surveyed', # '1 - property was surveyed', # '2 - same archetype', - # '3 - similar property', - # '3 - similar property, all areas searched', - # '3 - similar property, relaxed conditions' + # '3 - similar property, weighted on distance' + + gain_columns = [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', + '3 - similar property, weighted on distance' + ] # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' - geographic_summary["Gain"] = geographic_summary[ - [ - '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', - '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' - ] - ].sum(axis=1) - geographic_summary["Loss"] = geographic_summary[ - ['5 - EPC C or above', '5 - property was surveyed'] - ].sum(axis=1) + loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', + '5 - property was surveyed'] + geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) + geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) print(geographic_summary.sum()) @@ -2180,30 +2214,82 @@ def propsed_wave_3_sample(): loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values - # Define the coefficients for the objective function (negative because we maximize Gain) - c = -gain + def optimise(gain, loss, max_loss=250): - # Define constraints - A = [loss] # Only 1 constraint for now, total Loss - b = [250] # Maximum total Loss allowed + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain - # Bounds for each variable (select or not select each row, 0 <= x <= 1) - bounds = [(0, 1) for _ in gain] + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [max_loss] # Maximum total Loss allowed - # Solve the problem using linprog with HiGHS solver - result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') - if not result.success: - raise Exception("Optimization failed") + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] - selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 - optimal_gain = -result.fun - print(optimal_gain) + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + + return selected_rows, optimal_gain + + selected_rows, _ = optimise(gain, loss, 250) # Select the rows that are selected geographic_summary["Selected"] = selected_rows == 1 geographic_summary[geographic_summary["Selected"]].sum() - bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + + region_totals = geographic_summary[ + geographic_summary["Selected"] + ][["Gain", "Loss"]].sum() + + # We now see if there are any postcodes that have no loss that can be added + unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values + + postcode_summary = results.pivot_table( + index='Postcode', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + postcode_summary = postcode_summary.merge( + results[["Postcode", "Postal Region"]].drop_duplicates(), + how="left", on="Postcode" + ) + + postcode_summary_unselected_regions = postcode_summary[ + postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) + ].copy() + + postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) + postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) + + # Remaining loss allowed + remaining_loss_constraint = 250 - region_totals["Loss"] + postcode_selected_rows, _ = optimise( + gain=postcode_summary_unselected_regions["Gain"].values, + loss=postcode_summary_unselected_regions["Loss"].values, + max_loss=int(remaining_loss_constraint) + ) + + postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1 + postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum() + + postcode_optimised_additional_properties = postcode_summary_unselected_regions[ + postcode_summary_unselected_regions["Selected"] + ] + + postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() + + bid_size = region_totals.sum() + postcode_totals.sum() print("Bid Size:", bid_size) + total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"] + print("Total EPC D or below:", total_epc_d_or_below) + total_epc_c = region_totals["Loss"] + postcode_totals["Loss"] + print("Total EPC C or above:", total_epc_c) # if __name__ == "__main__": # main() From 67f97feb18829a4a2d327335a4a6ed8c8c06e495 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 22:33:42 +0000 Subject: [PATCH 17/31] messing around with street match --- .../stonewater/Wave 3 Preparation.py | 105 ++++++++++++------ 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 6f98c9fd..5ebb06e2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1637,7 +1637,7 @@ def propsed_wave_3_sample(): # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing # UPRN - asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])] + asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1645,12 +1645,13 @@ def propsed_wave_3_sample(): # Create the postal region, taking the first part of the postcode asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] unique_postal_regions = asset_list["Postal Region"].unique() # Keep just the columns we need asset_list = asset_list[ - ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", - "Heating"] + ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region", + "Property Type", "Wall Type", "Roof Type", "Heating"] ] survey_results = pd.read_excel( @@ -1853,7 +1854,6 @@ def propsed_wave_3_sample(): suffixes=("", "_method2") ) else: - region_assets = region_assets.merge( archetype_surveyed, on="Archetype ID", @@ -1903,20 +1903,20 @@ def propsed_wave_3_sample(): surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0] + survey_results_with_original_features["Property Type"] == + property["Property Type"] ) & ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] ) & ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] + survey_results_with_original_features["Roof Type"] == + property["Roof Type"] ) & ( - survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0] + survey_results_with_original_features["Heating"] == + property["Heating"] ) ].copy() @@ -1962,7 +1962,10 @@ def propsed_wave_3_sample(): if "Electric" in property["Heating"]: # Take other electric heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] - elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)": + elif property["Heating"] in [ + "Community Heating Systems: Community boilers only (RdSAP)", + "Community Heating Systems: Community CHP and boilers (RdSAP)" + ]: # Take other community heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Community")] elif property["Heating"] == 'Heat Pump: (from database)': @@ -2001,8 +2004,8 @@ def propsed_wave_3_sample(): if any(surveyed["Postal Region"] == property["Postal Region"]): surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] - # Take the 5 nearest - surveyed = surveyed.head(5) + # Take the 3 nearest + surveyed = surveyed.head(3) # # We allow a max distance of 10km # surveyed = surveyed[surveyed["distance_meters"] < 10000] @@ -2176,6 +2179,9 @@ def propsed_wave_3_sample(): results = pd.concat(results) + # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) + # region = home["Postal Region"].values[0] + # Create a pivot table for counts of Confidence Tier by Postal Region geographic_summary = results.pivot_table( index='Postal Region', @@ -2192,7 +2198,9 @@ def propsed_wave_3_sample(): # '3 - similar property, weighted on distance' gain_columns = [ - '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', + '1 - Archetype surveyed', + '1 - property was surveyed', + '2 - same archetype', '3 - similar property, weighted on distance' ] # @@ -2200,8 +2208,11 @@ def propsed_wave_3_sample(): # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' - loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', - '5 - property was surveyed'] + loss_columns = [ + '4 - no similar property, needs survey to confirm', + '5 - EPC C or above', + '5 - property was surveyed' + ] geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) @@ -2249,26 +2260,30 @@ def propsed_wave_3_sample(): # We now see if there are any postcodes that have no loss that can be added unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values + # TODO: Try on street + postcode_summary = results.pivot_table( - index='Postcode', + index='Street and Region', columns='Confidence Tier', aggfunc='size', fill_value=0 ).reset_index() - postcode_summary = postcode_summary.merge( - results[["Postcode", "Postal Region"]].drop_duplicates(), - how="left", on="Postcode" - ) - - postcode_summary_unselected_regions = postcode_summary[ - postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) - ].copy() + # postcode_summary = postcode_summary.merge( + # results[["Postcode", "Postal Region"]].drop_duplicates(), + # how="left", on="Postcode" + # ) + # + postcode_summary_unselected_regions = postcode_summary.copy() + # postcode_summary_unselected_regions = postcode_summary[ + # postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) + # ].copy() postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) # Remaining loss allowed - remaining_loss_constraint = 250 - region_totals["Loss"] + # remaining_loss_constraint = 230 - region_totals["Loss"] + remaining_loss_constraint = 250 postcode_selected_rows, _ = optimise( gain=postcode_summary_unselected_regions["Gain"].values, loss=postcode_summary_unselected_regions["Loss"].values, @@ -2284,12 +2299,40 @@ def propsed_wave_3_sample(): postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() - bid_size = region_totals.sum() + postcode_totals.sum() + bid_size = postcode_totals.sum() print("Bid Size:", bid_size) - total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"] + total_epc_d_or_below = postcode_totals["Gain"] print("Total EPC D or below:", total_epc_d_or_below) - total_epc_c = region_totals["Loss"] + postcode_totals["Loss"] + total_epc_c = postcode_totals["Loss"] print("Total EPC C or above:", total_epc_c) + # Total needing a survey + total_needing_survey = postcode_optimised_additional_properties[ + "4 - no similar property, needs survey to confirm" + ].sum() + print("Total needing survey:", total_needing_survey) + + # Look for postcodes that have no loss + unselected_streets = postcode_summary_unselected_regions[ + ~postcode_summary_unselected_regions["Selected"] + ]["Street and Region"].values + + postcode_summary2 = results[ + results["Street and Region"].isin(unselected_streets) + ].pivot_table( + index='Postcode', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + + postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1) + postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1) + + no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False) + total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() + print(total_bid_size) + + z = results[results["Confidence Tier"] == "5 - EPC C or above"] # if __name__ == "__main__": # main() From efba61c6ac52740d70c51864ea49c0d5623b353d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 22:51:24 +0000 Subject: [PATCH 18/31] tweaking --- .../stonewater/Wave 3 Preparation.py | 121 ++++++++++++------ 1 file changed, 83 insertions(+), 38 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5ebb06e2..974cd908 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1719,6 +1719,72 @@ def propsed_wave_3_sample(): # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D # + def match_property_to_surveyed(property, survey_results_with_original_features): + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"] == + property["Roof Type"] + ) & + ( + survey_results_with_original_features["Heating"] == + property["Heating"] + ) + ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"] == + property["Heating"] + ) + ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) + ].copy() + + return surveyed + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1757,6 +1823,7 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): + region_surveyed = [] for arch_id in archetypes: for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): @@ -1765,6 +1832,12 @@ def propsed_wave_3_sample(): ].copy() if archetype_data.empty: continue + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered + archetype_data["distance_meters"] = haversine( lat1=property.latitude, lon1=property.longitude, lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values @@ -1899,28 +1972,15 @@ def propsed_wave_3_sample(): # This means that this archetype was never surveyed and so we need to find a sufficiently similar property final_missed_matches = [] for a_id in missed_addressids: + + match_type = "3 - compared to similar properties" + property = asset_list[asset_list["Address ID"] == a_id].squeeze() - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] - ) & - ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] - ) & - ( - survey_results_with_original_features["Roof Type"] == - property["Roof Type"] - ) & - ( - survey_results_with_original_features["Heating"] == - property["Heating"] - ) - ].copy() + surveyed = match_property_to_surveyed(property, survey_results_with_original_features) if surveyed.empty: + match_type = "3 - compared to similar properties, relaxed" # In this case, we do one additional check where we filter on everything the same apart from heating, # where we do a slightly more rough match surveyed = survey_results_with_original_features[ @@ -2026,14 +2086,12 @@ def propsed_wave_3_sample(): expected_epc = sap_to_epc(expected_sap) if expected_epc in ["C", "B", "A"]: - tier = "5 - EPC C or above" - else: - tier = "3 - similar property, weighted on distance" + match_type = "5 - EPC C or above" final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": tier, + "Confidence Tier": match_type, "Current EPC Band": expected_epc } ) @@ -2197,22 +2255,9 @@ def propsed_wave_3_sample(): # '2 - same archetype', # '3 - similar property, weighted on distance' - gain_columns = [ - '1 - Archetype surveyed', - '1 - property was surveyed', - '2 - same archetype', - '3 - similar property, weighted on distance' - ] - # - # Loss is the sum of these columns: - # '4 - no similar property, needs survey to confirm', - # '5 - EPC C or above', '5 - property was surveyed' + gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) + loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) - loss_columns = [ - '4 - no similar property, needs survey to confirm', - '5 - EPC C or above', - '5 - property was surveyed' - ] geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) @@ -2283,7 +2328,7 @@ def propsed_wave_3_sample(): # Remaining loss allowed # remaining_loss_constraint = 230 - region_totals["Loss"] - remaining_loss_constraint = 250 + remaining_loss_constraint = 220 postcode_selected_rows, _ = optimise( gain=postcode_summary_unselected_regions["Gain"].values, loss=postcode_summary_unselected_regions["Loss"].values, From 294506853dd32fb9aa21ce6500d6eebed7e41be6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 18:24:26 +0000 Subject: [PATCH 19/31] adding in new features --- etl/customers/aiha/bid_numbers.py | 18 +++++- etl/customers/remote_assessments/app.py | 1 + .../stonewater/Wave 3 Preparation.py | 59 +++++++++++++++++-- 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py index 96859f99..b371e2e5 100644 --- a/etl/customers/aiha/bid_numbers.py +++ b/etl/customers/aiha/bid_numbers.py @@ -52,6 +52,20 @@ aiha_wave_3_features = aiha_original_asset_data[ wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts() property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index() +aiha_wave_3_features[aiha_wave_3_features["Property type"] == "Flat"][["Street address", "Postcode"]] + +# 4 Yetev Lev Court  ... Semi-Detached mid - Medium +# B 86 Bethune Road ... Mid-Terrace top. - Low +# A 80 Bethune Road ... Mid-Terrace ground. - Low +# B 80 Bethune Road ... \n \n - Low +# A 9 Clapton Common ... Semi-Detached ground. - Low +# C 9 Clapton Common ... End-Terrace \n. - Low +# B 89 Manor Road ... \n \n. - Low +# A 6 Northfield Road ... Detached top. - Low +# 13 Northfield Rd ... Semi-Detached \n - Low +# A 73 Manor Road ... End-Terrace \n - Low +# B 73 Manor Road ... Detached top - Low + # Hornsey data - contained in original asset list hornsey_asset_list = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing " @@ -88,5 +102,5 @@ caha_epc_data = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx" ) -caha_epc_data["property_type"].value_counts() -caha_epc_data["wall_type"].value_counts() +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["property_type"].value_counts() +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["wall_type"].value_counts() diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 33015d87..59e0e868 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -17,6 +17,7 @@ def app(): "address": "5, Lynton Street", "postcode": "DE22 3RW" } + ] asset_list = pd.DataFrame(asset_list) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 974cd908..81b5915f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -6,6 +6,7 @@ import numpy as np from tqdm import tqdm from collections import Counter from scipy.optimize import linprog +from utils.s3 import read_pickle_from_s3 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") @@ -1264,7 +1265,7 @@ def main(): stonewater_data[c] = stonewater_data[c].astype(str) # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) cost_sheet = [ { @@ -1654,17 +1655,66 @@ def propsed_wave_3_sample(): "Property Type", "Wall Type", "Roof Type", "Heating"] ] + # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) survey_results = pd.read_excel( os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), header=13, sheet_name="Modelled Packages" ) + additional_survey_data = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), + header=0 + ) + survey_results = survey_results.merge( + additional_survey_data[ + [ + "Address ID", + "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", + "Main Building Alternative Wall Thickness" + ] + ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}), + how="left", + on="Address ID" + ) + # TOOD: We probably want the actual surveyed wall, roof, heating type survey_results = survey_results[ - ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"] - ] - survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + [ + "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Existing Primary Heating System", + "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", + "Main Building Alternative Wall Thickness" + ] + ].rename( + columns={ + "Existing Primary Heating System": "Surveyed Primary Heating System" + } + ) + + # Concatenate from the wall information + survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[ + "Main Wall Insulation Type"] + # Alternative wall + survey_results["Survey: Main Alternative Wall"] = ( + survey_results["Main Building Alternative Wall Type"] + ": " + survey_results[ + "Main Building Alternative Wall Insulation"] + ) + # Roof information + survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[ + "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) + + # Drop the individual columns: + survey_results = survey_results.drop( + columns=[ + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Main Wall Type", "Main Wall Insulation Type", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation" + ] + ) survey_results_with_original_features = survey_results.merge( asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], @@ -1676,7 +1726,6 @@ def propsed_wave_3_sample(): raise ValueError("Something went wrong") # We get longitude & Latitude - from utils.s3 import read_pickle_from_s3 archetyping_spatial_features = read_pickle_from_s3( bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", ) From 377d9929e418073567b6af8f589eb5fe58e92a1e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 19:21:35 +0000 Subject: [PATCH 20/31] cleaning roof extraction --- .../stonewater/Wave 3 Preparation.py | 100 +++++++++++++----- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 81b5915f..aa9e4488 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -291,26 +291,11 @@ def extract_summary_report(pdf_path): data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL) - roof_text = roof_section.group(1).strip() - roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text) - data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None - - # Check if "Insulation" exists between Type and Insulation Thickness - insulation_search = re.search( - r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL - ) - - if insulation_search: - # Insulation match will be present if it exists, otherwise it will be None - insulation_match = insulation_search.group(2) # Optional group for Insulation - insulation_thickness_match = insulation_search.group(4) # Required group for Insulation Thickness - - # Populate insulation fields - data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None - data["Main Roof Insulation Thickness"] = ( - insulation_thickness_match.strip() if insulation_thickness_match else None - ) + extracted_roof_data = extract_roof_details_summary(text) + main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] + data["Main Roof Type"] = main_roof_data["Roof Type"] + data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] walls_data = extract_wall_details_summary(text) # Get the main building wall data @@ -593,6 +578,54 @@ def extract_roof_details_epr(text): return roof_data +def extract_roof_details_summary(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the 8.0 Roofs section of the summary report. + """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append({ + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + }) + + return roof_data + + def extract_wall_details_epr(text): """ Extracts wall type, insulation, dry-lining, and thickness for each building part @@ -1691,21 +1724,21 @@ def propsed_wave_3_sample(): ] ].rename( columns={ - "Existing Primary Heating System": "Surveyed Primary Heating System" + "Existing Primary Heating System": "Survey: Primary Heating System" } ) # Concatenate from the wall information - survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[ - "Main Wall Insulation Type"] + survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ + "Main Wall Insulation Type"].astype(str) # Alternative wall survey_results["Survey: Main Alternative Wall"] = ( - survey_results["Main Building Alternative Wall Type"] + ": " + survey_results[ - "Main Building Alternative Wall Insulation"] + survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[ + "Main Building Alternative Wall Insulation"].astype(str) ) # Roof information - survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[ - "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) + survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[ + "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) # Drop the individual columns: survey_results = survey_results.drop( @@ -1834,6 +1867,11 @@ def propsed_wave_3_sample(): return surveyed + survey_attribute_columns = [ + "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System' + ] + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1845,7 +1883,8 @@ def propsed_wave_3_sample(): ] region_assets = region_assets.merge( - exact_surveyed[["Address ID", "Current EPC Band"]], + exact_surveyed[ + ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns], on="Address ID", how="left" ) @@ -2286,6 +2325,11 @@ def propsed_wave_3_sample(): results = pd.concat(results) + # Check if there are missings in current epc band, current sap rating or any of the survey attributes + for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns: + if pd.isnull(results[c]).sum(): + raise Exception("Something went wrong") + # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) # region = home["Postal Region"].values[0] From a7857c0375949f5d45d47afe41f59e07de883e71 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 20:30:57 +0000 Subject: [PATCH 21/31] pulling out data from best match --- .../stonewater/Wave 3 Preparation.py | 111 ++++++++++-------- etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 65 ++++------ 3 files changed, 83 insertions(+), 94 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index aa9e4488..08236d5b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1727,7 +1727,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) - + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ "Main Wall Insulation Type"].astype(str) @@ -1872,6 +1872,8 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System' ] + survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy() + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1884,10 +1886,14 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ - ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns], + ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ + "Survey: Matching Address ID" + ] + ], on="Address ID", how="left" ) + region_assets['Distance to Closest Match (m)'] = 0 # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -1901,61 +1907,62 @@ def propsed_wave_3_sample(): "5 - property was surveyed", region_assets["Confidence Tier"] ) - archetypes = region_assets[ + archetype_ids = region_assets[ pd.isnull(region_assets["Confidence Tier"]) ]["Archetype ID"].unique() # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): + region_surveyed = [] + for arch_id in archetype_ids: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered - region_surveyed = [] - for arch_id in archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - if archetype_data.shape[0] > 1: - # Look for an exact match, or as close as possible - archetype_data_filtered = match_property_to_surveyed(property, archetype_data) - if not archetype_data_filtered.empty: - archetype_data = archetype_data_filtered + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - region_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) + # We take the features of the closest matching property + closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0] - region_surveyed = pd.DataFrame(region_surveyed) - region_assets = region_assets.merge( - region_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method1") - ) - else: - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"], + 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], + 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"] + } + ) + + region_surveyed = pd.DataFrame(region_surveyed) + starting_shape = region_assets.shape[0] + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + if region_assets.shape[0] != starting_shape: + raise ValueError("Something went wrong") # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -2326,7 +2333,9 @@ def propsed_wave_3_sample(): results = pd.concat(results) # Check if there are missings in current epc band, current sap rating or any of the survey attributes - for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns: + for c in ( + ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 913a04b8..d5a5134f 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -269,6 +269,7 @@ class RetrieveFindMyEpc: "Loft insulation": ["loft_insulation"], "Solar photovoltaic (PV) panels": ["solar_pv"], "Party wall insulation": ["party_wall_insulation"], + 'Draught proofing': ["draught_proofing"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index f24c5bb2..1e478b0c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): epc_data = [] errors = [] + no_epc = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - postcode = home[postcode_column] - house_number = home[address1_column] - full_address = home[fulladdress_column] - - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - time.sleep(np.random.uniform(0.1, 1)) try: postcode = home[postcode_column] house_number = home[address1_column] @@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.find_property(skip_os=True) if searcher.newest_epc is None: + no_epc.append(home["row_id"]) continue # Look for EPC recommendatons @@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): errors.append(home["row_id"]) time.sleep(5) - return epc_data, errors + return epc_data, errors, no_epc def extract_address1(asset_list, full_address_col, method="first_two_words"): @@ -140,26 +108,37 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" - DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" + DATA_FILENAME = "Bromford programme review.xlsx" + SHEET_NAME = "Bromford" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = None + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "No." ADDRESS1_METHOD = "first_two_words" + ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) + asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: + asset_list[col] = asset_list[col].astype(str) asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + asset_list = extract_address1( + asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD + ) - epc_data, errors = get_data( + if FULLADDRESS_COLUMN is None: + FULLADDRESS_COLUMN = "fulladdress_extracted" + # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas + asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, @@ -168,7 +147,7 @@ def app(): # We now retrieve any failed properties asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _ = get_data( + epc_data_failed, _, _ = get_data( asset_list=asset_list_failed, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, From 7accbded137918ba4e38c5b6ed79703b0e727e3d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 21:38:00 +0000 Subject: [PATCH 22/31] debugging find epc pull --- etl/find_my_epc/RetrieveFindMyEpc.py | 21 ++++++++++++++++++++- etl/route_march_data_pull/app.py | 22 ++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index d5a5134f..ac0e8235 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -126,6 +126,7 @@ class RetrieveFindMyEpc: # Find all h3 headers for each step and extract their related information step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m') previous_sap_score = current_sap + previous_epc = current_rating.split(' ')[-6] for step_num, step_header in enumerate(step_headers, start=1): # Extract the step title (the measure) measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "") @@ -138,7 +139,11 @@ class RetrieveFindMyEpc: # Check if the potential rating div is found if potential_rating_div: # Extract the rating text within the SVG text element - rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip() + extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold') + if extracted_rating_text is not None: + rating_text = extracted_rating_text.text.strip() + else: + rating_text = " ".join([str(previous_sap_score), previous_epc]) # Parse the rating text to separate the numeric rating and EPC letter new_rating = int(rating_text.split()[0]) new_epc = rating_text.split()[1] @@ -152,6 +157,7 @@ class RetrieveFindMyEpc: "sap_points": new_rating - previous_sap_score }) previous_sap_score = new_rating + previous_epc = new_epc # Search for the assessment informaton assessment_information = address_res.find('div', {'id': 'information'}) @@ -270,6 +276,19 @@ class RetrieveFindMyEpc: "Solar photovoltaic (PV) panels": ["solar_pv"], "Party wall insulation": ["party_wall_insulation"], 'Draught proofing': ["draught_proofing"], + "Roof insulation recommendation": [], + "Cavity wall insulation recommendation": [], + "Windows draught proofing": [], + "Low energy lighting for all fixed outlets": ["low_energy_lighting"], + "Cylinder thermostat recommendation": [], + "Heating controls recommendation": [], + "Replace boiler with Band A condensing boiler": [], + "Solar panel recommendation": [], + "Double glazing recommendation": [], + "Solid wall insulation recommendation": [], + "Fuel change recommendation": [], + "PV Cells recommendation": [], + "Replacement glazing units": ["double_glazing"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1e478b0c..80caefc9 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,5 +1,6 @@ import os import time +from idlelib.iomenu import errors import pandas as pd import numpy as np @@ -21,6 +22,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): + home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze() + epc_data = [] errors = [] no_epc = [] @@ -56,10 +59,21 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): property_recommendations = {"rows": []} # Retrieve data from FindMyEPC - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + else: + find_epc_data = {} + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") time.sleep(np.random.uniform(0.1, 1)) epc = { From 6eb52a509ebb8a110ca09533e4cba85b66edacf2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 21:38:40 +0000 Subject: [PATCH 23/31] removing error line --- etl/route_march_data_pull/app.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 80caefc9..d9f6bf43 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -22,8 +22,6 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): - home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze() - epc_data = [] errors = [] no_epc = [] From ac9b7b37300204c83f862871ebd511208625978b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:08:10 +0000 Subject: [PATCH 24/31] updating methdology for matching --- .../stonewater/Wave 3 Preparation.py | 193 +++++++++++------- 1 file changed, 114 insertions(+), 79 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 08236d5b..f74dc19d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1867,6 +1867,19 @@ def propsed_wave_3_sample(): return surveyed + def fill_survey_columns(region_assets, suffix): + for col in [ + 'Current EPC Band', 'Current SAP Rating', + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + 'Survey: Main Roof Type', 'Survey: Primary Heating System', + 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + ]: + region_assets[col] = np.where( + pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), + region_assets[col + suffix], region_assets[col] + ) + return region_assets + survey_attribute_columns = [ "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System' @@ -1920,6 +1933,14 @@ def propsed_wave_3_sample(): ].copy() if archetype_data.empty: continue + + match_type = "2 - same archetype" + if any(archetype_data["Postal Region"] == property["Postal Region"]): + match_type = "1 - same archetype, same postal region" + archetype_data = archetype_data[ + archetype_data["Postal Region"] == property["Postal Region"] + ] + if archetype_data.shape[0] > 1: # Look for an exact match, or as close as possible archetype_data_filtered = match_property_to_surveyed(property, archetype_data) @@ -1949,11 +1970,21 @@ def propsed_wave_3_sample(): 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], - 'Distance to Closest Match (m)': closest_match["distance_meters"] + 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Match Type": match_type } ) - region_surveyed = pd.DataFrame(region_surveyed) + + if region_surveyed.empty: + region_surveyed = pd.DataFrame( + columns=[ + "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + ] + ) + starting_shape = region_assets.shape[0] region_assets = region_assets.merge( region_surveyed, @@ -1968,95 +1999,99 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed", region_assets["Confidence Tier"] + "1 - Archetype surveyed in region", region_assets["Confidence Tier"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), - region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"] - ) # Handle EPC C region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), "5 - EPC C or above", region_assets["Confidence Tier"] ) - region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets = fill_survey_columns(region_assets, suffix="_method1") - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] + region_assets = region_assets.drop(columns=method_1_columns) - if archetype_surveyed["Archetype ID"].duplicated().sum(): + missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - archetype_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) - archetype_surveyed = pd.DataFrame(archetype_surveyed) - region_assets = region_assets.merge( - archetype_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method2") - ) - else: - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - region_assets["Confidence Tier"]), - "2 - same archetype", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]), - region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"] - ) - - region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + raise Exception("IMPLEMENT ME") + # archetype_data["distance_meters"] = haversine( + # lat1=property.latitude, lon1=property.longitude, + # lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + # ) + # expected_sap = np.average( + # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + # ) + # expected_epc = sap_to_epc(expected_sap) + # archetype_surveyed.append( + # { + # "Archetype ID": arch_id, + # "Address ID": property["Address ID"], + # "Current EPC Band": expected_epc + # } + # ) + # archetype_surveyed = pd.DataFrame(archetype_surveyed) + # if archetype_surveyed.empty: + # archetype_surveyed = pd.DataFrame( + # columns=[ + # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + # 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + # ] + # ) + # + # region_assets = region_assets.merge( + # archetype_surveyed, + # on=["Archetype ID", "Address ID"], + # how="left", + # suffixes=("", "_method2") + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( + # region_assets["Confidence Tier"]), + # "2 - same archetype", region_assets["Confidence Tier"] + # ) + # + # for col in [ + # 'Current EPC Band', 'Current SAP Rating', + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + # 'Survey: Main Roof Type', 'Survey: Primary Heating System', + # 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + # ]: + # region_assets[col] = np.where( + # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), + # region_assets[col + "_method2"], region_assets[col] + # ) + # + # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] + # region_assets = region_assets.drop(columns=method_2_columns) # We label EPC C properties - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "C", region_assets["Current EPC Band"] - ) + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + # "5 - EPC C or above", region_assets["Confidence Tier"] + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "5 - EPC C or above", region_assets["Confidence Tier"] + # ) + # + # region_assets["Current EPC Band"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "C", region_assets["Current EPC Band"] + # ) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() From 5d5001fec3114eab4ba84e7fc0e40270ec017d35 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:47:39 +0000 Subject: [PATCH 25/31] added de-duping --- .../stonewater/Wave 3 Preparation.py | 221 ++++++------------ etl/find_my_epc/RetrieveFindMyEpc.py | 6 + etl/route_march_data_pull/app.py | 7 + 3 files changed, 85 insertions(+), 149 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index f74dc19d..744b3400 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1803,21 +1803,26 @@ def propsed_wave_3_sample(): def match_property_to_surveyed(property, survey_results_with_original_features): surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] + ) & ( survey_results_with_original_features["Property Type"] == property["Property Type"] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Roof Type"] == - property["Roof Type"] - ) & - ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() @@ -1826,23 +1831,47 @@ def propsed_wave_3_sample(): surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( survey_results_with_original_features["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() + # surveyed = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"] == + # property["Property Type"] + # ) & + # ( + # survey_results_with_original_features["Wall Type"] == + # property["Wall Type"] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"] == + # property["Heating"] + # ) + # ].copy() + if not surveyed.empty: return surveyed @@ -1906,7 +1935,12 @@ def propsed_wave_3_sample(): on="Address ID", how="left" ) - region_assets['Distance to Closest Match (m)'] = 0 + region_assets['Distance to Closest Match (m)'] = None + region_assets["Distance to Closest Match (m)"] = np.where( + ~pd.isnull(region_assets["Current EPC Band"]), + 0, + region_assets["Distance to Closest Match (m)"] + ) # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -2016,7 +2050,7 @@ def propsed_wave_3_sample(): missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] + # archetype_surveyed = [] for arch_id in missed_archetypes: for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): archetype_data = survey_results_with_original_features[ @@ -2175,7 +2209,14 @@ def propsed_wave_3_sample(): { "Address ID": a_id, "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Needs Survey" + "Current EPC Band": "Needs Survey", + "Current SAP Rating": "Needs Survey", + 'Survey: Main Wall Type': "Not Surveyed", + "Survey: Main Alternative Wall": "Not Surveyed", + "Survey: Main Roof Type": "Not Surveyed", + "Survey: Primary Heating System": "Not Surveyed", + "Survey: Matching Address ID": "Not Surveyed", + 'Distance to Closest Match (m)': 9999999, } ) continue @@ -2197,18 +2238,6 @@ def propsed_wave_3_sample(): # Take the 3 nearest surveyed = surveyed.head(3) - # # We allow a max distance of 10km - # surveyed = surveyed[surveyed["distance_meters"] < 10000] - # if surveyed.empty: - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "4 - no similar property, needs survey to confirm", - # "Current EPC Band": "Needs Survey" - # } - # ) - # continue - # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) @@ -2218,129 +2247,24 @@ def propsed_wave_3_sample(): if expected_epc in ["C", "B", "A"]: match_type = "5 - EPC C or above" + closest_match = surveyed.iloc[0] + final_missed_matches.append( { "Address ID": a_id, "Confidence Tier": match_type, - "Current EPC Band": expected_epc + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"], + "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"], + "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"], } ) continue - # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - # filter_property_types = ["House", "Bungalow"] - # else: - # filter_property_types = ["Flat"] - # - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # ) - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # if surveyed_similar.empty: - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # )) & - # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0]) & - # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0]) & - # (survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0]) - # ] - # - # if surveyed_similar.empty: - # - # # We get an average based on the postcode - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # )) - # ] - # if surveyed_similar.empty: - # surveyed_similar_entire_population = survey_results_with_original_features[ - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ - # "Property Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # - # # We order them by distance on postcode - # - # # Average - # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "3 - similar property, all areas searched", - # "Current EPC Band": expected_epc - # } - # - # ) - # else: - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property, relaxed conditions" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - # continue - # # We take an average - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - final_missed_matches = pd.DataFrame(final_missed_matches) region_assets = region_assets.merge( @@ -2353,12 +2277,11 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( region_assets["Confidence Tier_method3"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]), - region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] - ) - region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"]) + region_assets = fill_survey_columns(region_assets, suffix="_method3") + + method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")] + region_assets = region_assets.drop(columns=method_3_columns) if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index ac0e8235..b6394275 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -289,6 +289,12 @@ class RetrieveFindMyEpc: "Fuel change recommendation": [], "PV Cells recommendation": [], "Replacement glazing units": ["double_glazing"], + "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"], + "High heat retention storage heaters": ["high_heat_retention_storage_heaters"], + "Gas condensing boiler": ["boiler_upgrade"], + "Change room heaters to condensing boiler": ["boiler_upgrade"], + "Cylinder thermostat": ["cylinder_thermostat"], + "Heat recovery system for mixer showers": ["heat_recovery_shower"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index d9f6bf43..6f9dd135 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -150,6 +150,13 @@ def app(): # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + # We check for duplicated addresses + asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] + if asset_list["deduper"].duplicated().sum(): + # Drop the dupes + print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") + asset_list = asset_list[~asset_list["deduper"].duplicated()] + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, From d65c99f62a0fd7cb6e1c58a5816db0e4e4477fb5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 08:41:44 +0000 Subject: [PATCH 26/31] tidying up optimisation process --- .../stonewater/Wave 3 Preparation.py | 105 ++++-------------- 1 file changed, 24 insertions(+), 81 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 744b3400..c8e61a0e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2297,39 +2297,9 @@ def propsed_wave_3_sample(): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") - # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) - # region = home["Postal Region"].values[0] - - # Create a pivot table for counts of Confidence Tier by Postal Region - geographic_summary = results.pivot_table( - index='Postal Region', - columns='Confidence Tier', - aggfunc='size', - fill_value=0 - ).reset_index() - - # We create the gain and loss columns - # Gain is the sum of these columns: - # '1 - Archetype surveyed', - # '1 - property was surveyed', - # '2 - same archetype', - # '3 - similar property, weighted on distance' - gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) - geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) - geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) - - print(geographic_summary.sum()) - - geographic_summary = geographic_summary.sort_values("Loss", ascending=True) - geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() - geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() - - loss = geographic_summary["Loss"].values - gain = geographic_summary["Gain"].values - def optimise(gain, loss, max_loss=250): # Define the coefficients for the objective function (negative because we maximize Gain) @@ -2352,76 +2322,51 @@ def propsed_wave_3_sample(): return selected_rows, optimal_gain - selected_rows, _ = optimise(gain, loss, 250) - - # Select the rows that are selected - geographic_summary["Selected"] = selected_rows == 1 - geographic_summary[geographic_summary["Selected"]].sum() - - region_totals = geographic_summary[ - geographic_summary["Selected"] - ][["Gain", "Loss"]].sum() - - # We now see if there are any postcodes that have no loss that can be added - unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values - - # TODO: Try on street - - postcode_summary = results.pivot_table( + street_summary = results.pivot_table( index='Street and Region', columns='Confidence Tier', aggfunc='size', fill_value=0 ).reset_index() - # postcode_summary = postcode_summary.merge( - # results[["Postcode", "Postal Region"]].drop_duplicates(), - # how="left", on="Postcode" - # ) - # - postcode_summary_unselected_regions = postcode_summary.copy() - # postcode_summary_unselected_regions = postcode_summary[ - # postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) - # ].copy() - postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) - postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) + street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) + street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - # Remaining loss allowed - # remaining_loss_constraint = 230 - region_totals["Loss"] - remaining_loss_constraint = 220 - postcode_selected_rows, _ = optimise( - gain=postcode_summary_unselected_regions["Gain"].values, - loss=postcode_summary_unselected_regions["Loss"].values, - max_loss=int(remaining_loss_constraint) + print(street_summary.sum()) + + selected_rows, _ = optimise( + gain=street_summary["Gain"].values, + loss=street_summary["Loss"].values, + max_loss=250 ) - postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1 - postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum() + street_summary["Selected"] = selected_rows == 1 + print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum()) - postcode_optimised_additional_properties = postcode_summary_unselected_regions[ - postcode_summary_unselected_regions["Selected"] + selected_streets = street_summary[ + street_summary["Selected"] ] - postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() + totals = selected_streets[["Gain", "Loss"]].sum() - bid_size = postcode_totals.sum() + bid_size = totals.sum() print("Bid Size:", bid_size) - total_epc_d_or_below = postcode_totals["Gain"] + total_epc_d_or_below = totals["Gain"] print("Total EPC D or below:", total_epc_d_or_below) - total_epc_c = postcode_totals["Loss"] + total_epc_c = totals["Loss"] print("Total EPC C or above:", total_epc_c) # Total needing a survey - total_needing_survey = postcode_optimised_additional_properties[ + total_needing_survey = selected_streets[ "4 - no similar property, needs survey to confirm" ].sum() print("Total needing survey:", total_needing_survey) # Look for postcodes that have no loss - unselected_streets = postcode_summary_unselected_regions[ - ~postcode_summary_unselected_regions["Selected"] + unselected_streets = street_summary[ + ~street_summary["Selected"] ]["Street and Region"].values - postcode_summary2 = results[ + postcode_summary = results[ results["Street and Region"].isin(unselected_streets) ].pivot_table( index='Postcode', @@ -2430,14 +2375,12 @@ def propsed_wave_3_sample(): fill_value=0 ).reset_index() - postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1) - postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1) + postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1) + postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1) - no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False) + no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False) total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() print(total_bid_size) - z = results[results["Confidence Tier"] == "5 - EPC C or above"] - # if __name__ == "__main__": # main() From d163ca99315b2e2c82b95ab629041351374fb081 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 13:54:46 +0000 Subject: [PATCH 27/31] fixing filling of property --- .../stonewater/Wave 3 Preparation.py | 188 +++++++++--------- 1 file changed, 98 insertions(+), 90 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c8e61a0e..426097e8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1669,7 +1669,7 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # TODO: We drop 7 properties missing # UPRN asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] # Clean address ids @@ -1699,15 +1699,23 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), header=0 ) - survey_results = survey_results.merge( + + survey_results = survey_results.drop( + columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] + ).merge( additional_survey_data[ [ "Address ID", "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness" + "Main Building Alternative Wall Thickness", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" ] - ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}), + ].rename( + columns={ + "Main Wall Insulation_x": "Main Wall Insulation Type", + } + ), how="left", on="Address ID" ) @@ -1718,6 +1726,7 @@ def propsed_wave_3_sample(): "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", "Existing Primary Heating System", + "Package Ref", "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", "Main Building Alternative Wall Thickness" @@ -1727,6 +1736,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ @@ -1929,7 +1939,7 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ - "Survey: Matching Address ID" + "Survey: Matching Address ID", "Package Ref" ] ], on="Address ID", @@ -2005,6 +2015,7 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"], "Match Type": match_type } ) @@ -2015,7 +2026,8 @@ def propsed_wave_3_sample(): columns=[ "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', + "Match Type" ] ) @@ -2032,8 +2044,8 @@ def propsed_wave_3_sample(): # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & - pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed in region", region_assets["Confidence Tier"] + pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]), + region_assets["Match Type"], region_assets["Confidence Tier"] ) # Handle EPC C @@ -2046,86 +2058,7 @@ def propsed_wave_3_sample(): region_assets = fill_survey_columns(region_assets, suffix="_method1") method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] - region_assets = region_assets.drop(columns=method_1_columns) - - missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - - # archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - raise Exception("IMPLEMENT ME") - # archetype_data["distance_meters"] = haversine( - # lat1=property.latitude, lon1=property.longitude, - # lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - # ) - # expected_sap = np.average( - # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - # ) - # expected_epc = sap_to_epc(expected_sap) - # archetype_surveyed.append( - # { - # "Archetype ID": arch_id, - # "Address ID": property["Address ID"], - # "Current EPC Band": expected_epc - # } - # ) - # archetype_surveyed = pd.DataFrame(archetype_surveyed) - # if archetype_surveyed.empty: - # archetype_surveyed = pd.DataFrame( - # columns=[ - # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - # 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' - # ] - # ) - # - # region_assets = region_assets.merge( - # archetype_surveyed, - # on=["Archetype ID", "Address ID"], - # how="left", - # suffixes=("", "_method2") - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - # region_assets["Confidence Tier"]), - # "2 - same archetype", region_assets["Confidence Tier"] - # ) - # - # for col in [ - # 'Current EPC Band', 'Current SAP Rating', - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', - # 'Survey: Main Roof Type', 'Survey: Primary Heating System', - # 'Survey: Matching Address ID', 'Distance to Closest Match (m)' - # ]: - # region_assets[col] = np.where( - # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), - # region_assets[col + "_method2"], region_assets[col] - # ) - # - # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] - # region_assets = region_assets.drop(columns=method_2_columns) - - # We label EPC C properties - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Current EPC Band"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "C", region_assets["Current EPC Band"] - # ) + region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"]) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() @@ -2217,6 +2150,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": "Not Surveyed", "Survey: Matching Address ID": "Not Surveyed", 'Distance to Closest Match (m)': 9999999, + "Package Ref": "Not Surveyed", } ) continue @@ -2261,6 +2195,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"] } ) continue @@ -2292,8 +2227,10 @@ def propsed_wave_3_sample(): # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( - ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + - survey_attribute_columns): + [ + "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns + ): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") @@ -2382,5 +2319,76 @@ def propsed_wave_3_sample(): total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() print(total_bid_size) + # Label final outputs + # We create a summary of packages by street + results["Package Ref"] = results["Package Ref"].fillna("Incomplete") + results["Package Ref"] = results["Package Ref"].astype(str) + package_summary = results.pivot_table( + index='Street and Region', + columns='Package Ref', + aggfunc='size', + fill_value=0 + ).reset_index() + + street_bid_structure = street_summary.merge( + package_summary, how="left", on="Street and Region" + ) + street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + individual_units_programme = results.copy() + individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( + street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values + ) + + # Merge on Stonewaters ID + asset_list_ids = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + )[["Address ID", "Org. ref."]] + # Clean address ids + asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] + asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] + asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) + individual_units_programme = individual_units_programme.merge( + asset_list_ids, + how="left", + on="Address ID", + ) + + individual_units_programme = individual_units_programme.merge( + asset_list_ids.rename( + columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"} + ), + how="left", + on="Survey: Matching Address ID" + ) + + individual_units_programme["Survey: Org. ref."] = np.where( + (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"), + "Not Surveyed", + individual_units_programme["Survey: Org. ref."] + ) + + if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull( + individual_units_programme["Org. ref."]).sum(): + raise ValueError("something went wrong") + + for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]: + individual_units_programme[col] = ( + individual_units_programme[col] + .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':' + .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': ' + .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space + .str.strip() # Strip leading/trailing spaces + ) + + individual_units_programme.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False + ) + # if __name__ == "__main__": # main() From 1645f9ab9ed84bdb90fa2a732d697111b36bd17b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:00:00 +0000 Subject: [PATCH 28/31] updating stonewater modelling code to use new data --- .../stonewater/Wave 3 Preparation.py | 288 +++++++++++++++--- 1 file changed, 247 insertions(+), 41 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 426097e8..f4195592 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1071,10 +1071,13 @@ def main(): ] # We now merge on the coordinator data so that against each property, we can map the measures + # TODO: Get the pre & post primary energy numbers + # TODO: Make sure the numbers are going down + retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, - "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx" + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" ), header=4 ) @@ -1084,6 +1087,18 @@ def main(): retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] + # populated_primary_energy = retrofit_packages_board[ + # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)']) + # ] + # + # z = populated_primary_energy[ + # populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[ + # 'BASE Primary energy (13a-272)'] + # ] + # + # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[ + # 'BASE Primary energy (13a-272)']) + # Replace \n with "" extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") @@ -1192,7 +1207,7 @@ def main(): # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - if len(missing_ids) != 6: + if len(missing_ids) != 1: raise Exception("Unacceptable number of missings") if matching_lookup["Address ID"].duplicated().sum(): @@ -1239,7 +1254,6 @@ def main(): if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") - # Create a section for costs for measure in measure_columns: stonewater_data[f"Cost of {measure}"] = None @@ -1297,8 +1311,41 @@ def main(): ]: stonewater_data[c] = stonewater_data[c].astype(str) + # FIll the primary energy numbers from the excel + stonewater_data = stonewater_data.merge( + retrofit_packages_board[ + [ + "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)" + ] + ], + on=["Address ID", "Name"], + how="left" + ) + stonewater_data["Primary Energy Use (kWh/yr)"] = np.where( + pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]), + stonewater_data["BASE Primary energy (13a-272)"], + stonewater_data["Primary Energy Use (kWh/yr)"] + ) + stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"]) + + # Add on organisation reference + original_archetypes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] + original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] + original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + + stonewater_data = stonewater_data.merge( + original_archetypes[["Address ID", 'Org. ref.']], + on="Address ID", + how="left" + ) + # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False) cost_sheet = [ { @@ -1677,6 +1724,12 @@ def propsed_wave_3_sample(): asset_list = asset_list[asset_list["Address ID"] != "Address ID"] asset_list["Address ID"] = asset_list["Address ID"].astype(int) + asset_list["Street name"] = np.where( + pd.isnull(asset_list["Street name"]), + asset_list["Postcode"], + asset_list["Street name"] + ) + # Create the postal region, taking the first part of the postcode asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] @@ -1684,43 +1737,16 @@ def propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region", + ["UPRN", "Address ID", 'Org. ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region", "Property Type", "Wall Type", "Roof Type", "Heating"] ] - # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) survey_results = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), header=13, sheet_name="Modelled Packages" ) - additional_survey_data = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), - header=0 - ) - - survey_results = survey_results.drop( - columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] - ).merge( - additional_survey_data[ - [ - "Address ID", - "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", - "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness", - "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" - ] - ].rename( - columns={ - "Main Wall Insulation_x": "Main Wall Insulation Type", - } - ), - how="left", - on="Address ID" - ) - - # TOOD: We probably want the actual surveyed wall, roof, heating type survey_results = survey_results[ [ "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", @@ -1768,6 +1794,105 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # Against properties that have NO package ref, we assign a package ref + properties_with_packages = survey_results_with_original_features[ + ~pd.isnull(survey_results_with_original_features["Package Ref"]) + ] + + properties_without_packages = survey_results_with_original_features[ + (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull( + survey_results_with_original_features["Package Ref"] + ) + ] + + # Change this to a lookup + package_ratings = pd.DataFrame([ + { + "1A": 1, + "1B": 2, + "2A": 3, + "2B": 4, + "3A": 5, + "3B": 6, + 4: 7 + } + ]) + package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank") + + mapped_package_refs = [] + for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)): + # Same archetype? + matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]] + + if matches.empty: + # Similar property + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"] == property["Wall Type"]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + raise Exception("Implement me") + if matches.shape[0] > 1: + # Take the package with the highest rank + matches = matches.merge( + package_ratings, + on="Package Ref", + how="left" + ).sort_values("Rank", ascending=False).head(1) + + mapped_package_refs.append( + { + "Address ID": property["Address ID"], + "Matched Package Ref": matches["Package Ref"].values[0] + } + ) + + mapped_package_refs = pd.DataFrame(mapped_package_refs) + + survey_results = survey_results.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results["Package Ref"] = np.where( + pd.notnull(survey_results["Matched Package Ref"]), + survey_results["Matched Package Ref"], + survey_results["Package Ref"] + ) + survey_results = survey_results.drop(columns=["Matched Package Ref"]) + + # Do the same with survey_results_with_original_features + survey_results_with_original_features = survey_results_with_original_features.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results_with_original_features["Package Ref"] = np.where( + pd.notnull(survey_results_with_original_features["Matched Package Ref"]), + survey_results_with_original_features["Matched Package Ref"], + survey_results_with_original_features["Package Ref"] + ) + survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"]) + + # Save the data for reference + # mapped_package_refs = mapped_package_refs.merge( + # asset_list[["Name", "Postcode", "Address ID", "Org. ref."]], + # on="Address ID", + # how="left" + # ) + # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False) + # We get longitude & Latitude archetyping_spatial_features = read_pickle_from_s3( bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", @@ -1911,7 +2036,8 @@ def propsed_wave_3_sample(): 'Current EPC Band', 'Current SAP Rating', 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', - 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + 'Survey: Matching Address ID', 'Distance to Closest Match (m)', + "Package Ref" ]: region_assets[col] = np.where( pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), @@ -2027,7 +2153,7 @@ def propsed_wave_3_sample(): "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', - "Match Type" + "Match Type", "Package Ref" ] ) @@ -2183,6 +2309,13 @@ def propsed_wave_3_sample(): closest_match = surveyed.iloc[0] + # The closest property may be an EPC C, we we take the package ref from the property that's the nearest + # with non-NA package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = surveyed["Package Ref"].dropna().values[0] + final_missed_matches.append( { "Address ID": a_id, @@ -2195,7 +2328,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": closest_match["Package Ref"] + "Package Ref": package_ref } ) continue @@ -2225,6 +2358,11 @@ def propsed_wave_3_sample(): results = pd.concat(results) + results[ + pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D") + ]["Postal Region"] + results[resul] + # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( [ @@ -2269,8 +2407,6 @@ def propsed_wave_3_sample(): street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - print(street_summary.sum()) - selected_rows, _ = optimise( gain=street_summary["Gain"].values, loss=street_summary["Loss"].values, @@ -2334,9 +2470,6 @@ def propsed_wave_3_sample(): package_summary, how="left", on="Street and Region" ) street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False - ) individual_units_programme = results.copy() individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( @@ -2386,6 +2519,79 @@ def propsed_wave_3_sample(): .str.strip() # Strip leading/trailing spaces ) + # Any EPC C properties that have been included should be flagged as potential low carbon heating + selected_epc_c = individual_units_programme[ + (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) & + (individual_units_programme["Unit in Programme"]) + ] + + flat_wall_map = { + "CA Cavity: F Filled Cavity": False, + "CA Cavity: A As Built": True, + "SO Solid Brick: A As Built": True, + "Not Surveyed": False + } + + heating_map = { + "BGW Post 98 Combi condens. with auto ign.": False, + "BGB Post 98 Regular condens. with auto ign.": False, + "SEK High heat retention storage heaters": False, + "SEB Modern slimline storage heaters": True, + "Not Surveyed": False + } + + infill_data = [] + for _, epc_c_property in selected_epc_c.iterrows(): + if epc_c_property["Property Type"].split(":")[0] == "Flat": + # Look for a wall insulation measure + infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Possible Flat Infill?": infill + } + ) + continue + + infill = heating_map[epc_c_property["Survey: Primary Heating System"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Low Carbon Heating Infill?": infill + } + ) + infill_data = pd.DataFrame(infill_data) + + individual_units_programme = individual_units_programme.merge( + infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']], + how="left", on="Address ID" + ) + + for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']: + individual_units_programme[c] = individual_units_programme[c].fillna(False) + + infill_by_street = infill_data.pivot_table( + index='Street and Region', + values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'], + aggfunc='sum', + fill_value=0 + ).reset_index() + + street_bid_structure = street_bid_structure.merge( + infill_by_street, how="left", on="Street and Region" + ) + + for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: + street_bid_structure[c] = street_bid_structure[c].fillna(0) + + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + # TODO: Add the full Address!!! + individual_units_programme.to_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False ) From 9057b3d4da71f3dd63a8ae2924a073f6cc168dc8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:04:19 +0000 Subject: [PATCH 29/31] fixing assignment of package ref --- etl/customers/stonewater/Wave 3 Preparation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index f4195592..4a841f61 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2126,8 +2126,16 @@ def propsed_wave_3_sample(): ) expected_epc = sap_to_epc(expected_sap) + archetype_data = archetype_data.sort_values("distance_meters", ascending=True) + # We take the features of the closest matching property - closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0] + closest_match = archetype_data.iloc[0] + + # Set the package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = archetype_data["Package Ref"].dropna().values[0] region_surveyed.append( { @@ -2141,7 +2149,7 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": closest_match["Package Ref"], + "Package Ref": package_ref, "Match Type": match_type } ) From 0fafb03deebca4833680594b989b8362386257be Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:06:51 +0000 Subject: [PATCH 30/31] tidying up code --- .../stonewater/Wave 3 Preparation.py | 27 ++----------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4a841f61..34ab778a 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2366,10 +2366,8 @@ def propsed_wave_3_sample(): results = pd.concat(results) - results[ - pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D") - ]["Postal Region"] - results[resul] + if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum(): + raise ValueError("Missing Package Refs") # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( @@ -2442,27 +2440,6 @@ def propsed_wave_3_sample(): ].sum() print("Total needing survey:", total_needing_survey) - # Look for postcodes that have no loss - unselected_streets = street_summary[ - ~street_summary["Selected"] - ]["Street and Region"].values - - postcode_summary = results[ - results["Street and Region"].isin(unselected_streets) - ].pivot_table( - index='Postcode', - columns='Confidence Tier', - aggfunc='size', - fill_value=0 - ).reset_index() - - postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1) - postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1) - - no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False) - total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() - print(total_bid_size) - # Label final outputs # We create a summary of packages by street results["Package Ref"] = results["Package Ref"].fillna("Incomplete") From 631a76cb99d213d857c732ea1a58dd9d4291a716 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 21 Nov 2024 11:41:16 +0000 Subject: [PATCH 31/31] stonewater model completed --- etl/customers/ksquared/Wave3 Modelling.py | 35 +++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 32 +++++++++++------ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 96ea2b03..7bfa33b3 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -8,6 +8,7 @@ from tqdm import tqdm import pandas as pd import numpy as np from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from etl.spatial.OpenUprnClient import OpenUprnClient from backend.SearchEpc import SearchEpc from utils.s3 import save_csv_to_s3 @@ -60,6 +61,7 @@ def hornsey(): } extracted_data = [] asset_list = [] + hornsey_asset_list["row_id"] = hornsey_asset_list.index for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)): if home["Address letter or number"] == "Flat 1 36 Haringey Park": @@ -108,12 +110,24 @@ def hornsey(): asset_list.append( { "uprn": newest_epc["uprn"], + "row_id": home["row_id"], "address": home["Address letter or number"], "postcode": home["Postcode"], "property_type": "Flat", # They're all flats } ) + # Get conservation area data + # uprns = [x["uprn"] for x in extracted_data] + # conservation_area_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") + # + # addresses = pd.DataFrame(asset_list) + # addresses["uprn"] = addresses["uprn"].astype(int) + # conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN") + # conservation_area_df.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/hornsey_conservation_area_data.csv" + # ) + # We format the extracted data so that is has the same structure as non-intrusive recommendations # We then get the UPRNs and create the asset list @@ -213,6 +227,8 @@ def caha(): # If pattern doesn't match, return original address return address + caha_asset_list["row_id"] = caha_asset_list.index + extracted_data = [] asset_list = [] for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)): @@ -270,6 +286,7 @@ def caha(): asset_list.append( { + "row_id": home["row_id"], "uprn": uprn, "address": address, "postcode": home["Postcode"], @@ -280,6 +297,24 @@ def caha(): } ) + # Missing row ids + missed = [r for r in caha_asset_list["row_id"].tolist() if r not in [x["row_id"] for x in asset_list]] + + no_data = [x for x in asset_list if x["uprn"] in [None, ""]] + no_data = pd.DataFrame(no_data) + + # Get conservation area data + uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] + conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev") + + addresses = pd.DataFrame(asset_list) + addresses["uprn"] = addresses["uprn"].astype(str) + conservation_area_data["UPRN"] = conservation_area_data["UPRN"].astype(str) + conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN") + conservation_area_df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_conservation_area_data.csv" + ) + non_invasive_recommendations = [ { "uprn": r["uprn"], diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 34ab778a..b6c29863 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -729,6 +729,7 @@ def extract_epr(pdf_path): "Main Building Alternative Wall Insulation": None, "Main Building Alternative Wall Dry-lining": None, "Main Building Alternative Wall Thickness": None, + "Main Fuel": None } with open(pdf_path, "rb") as file: @@ -1086,7 +1087,6 @@ def main(): retrofit_packages_board = retrofit_packages_board[ retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] - # populated_primary_energy = retrofit_packages_board[ # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)']) # ] @@ -2442,8 +2442,11 @@ def propsed_wave_3_sample(): # Label final outputs # We create a summary of packages by street - results["Package Ref"] = results["Package Ref"].fillna("Incomplete") + results["Package Ref"] = results["Package Ref"].fillna("EPC C - No Package") results["Package Ref"] = results["Package Ref"].astype(str) + results["Package Ref"] = np.where( + results["Package Ref"] == "4.0", "4", results["Package Ref"] + ) package_summary = results.pivot_table( index='Street and Region', columns='Package Ref', @@ -2451,6 +2454,8 @@ def propsed_wave_3_sample(): fill_value=0 ).reset_index() + assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0] + street_bid_structure = street_summary.merge( package_summary, how="left", on="Street and Region" ) @@ -2471,11 +2476,6 @@ def propsed_wave_3_sample(): asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) - individual_units_programme = individual_units_programme.merge( - asset_list_ids, - how="left", - on="Address ID", - ) individual_units_programme = individual_units_programme.merge( asset_list_ids.rename( @@ -2571,14 +2571,24 @@ def propsed_wave_3_sample(): for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: street_bid_structure[c] = street_bid_structure[c].fillna(0) - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + master_sheet = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master " + "sheet.csv", + encoding='latin1' + ) + master_sheet = master_sheet[["Address ID", "Main Fuel"]] + + individual_units_programme = individual_units_programme.merge( + master_sheet, how="left", on="Address ID" ) - # TODO: Add the full Address!!! + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure V2.csv"), index=False + ) individual_units_programme.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False ) # if __name__ == "__main__":