From fe6e83314f836f8268839b9d45b809bb8c4d83e2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 11 Nov 2024 16:35:12 +0000 Subject: [PATCH 1/2] working on stonewater matches --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../settle/route_march_2024_11_08.py | 226 ++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 58 ++++- 4 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 etl/customers/settle/route_march_2024_11_08.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/settle/route_march_2024_11_08.py b/etl/customers/settle/route_march_2024_11_08.py new file mode 100644 index 00000000..21b6f2df --- /dev/null +++ b/etl/customers/settle/route_march_2024_11_08.py @@ -0,0 +1,226 @@ +import os +import time + +import pandas as pd +from tqdm import tqdm + +from dotenv import load_dotenv +from utils.s3 import read_excel_from_s3 +from backend.SearchEpc import SearchEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + house_number = home["AddressLine1"] + full_address = ", ".join([home["AddressLine1"], home["AddressLine4"], home["AddressLine5"]]) + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/SETTLE FULL PROPOSED PROGRAMME.xlsx", + header=0 + ) + asset_list["row_id"] = asset_list.index + + epc_data, errors = get_data(asset_list) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data(asset_list_failed) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 9f929db1..0036a0a4 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -8,7 +8,7 @@ from collections import Counter CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") -NUM_FOLDERS = 14 +NUM_FOLDERS = 15 def sap_to_epc(sap_points: int | float): @@ -871,7 +871,10 @@ def main(): # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"), + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx" + ), header=4 ) retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] @@ -902,13 +905,24 @@ def main(): # '102 Cheaton Close': '', # 'Flat 16 Spring Gardens': '', # '4 Apple Close': '', - '25 Folly Lane': '', - + # '25 Folly Lane': '', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX' } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + # Handle the case that has the wrong postcode in the asset data if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() @@ -972,6 +986,10 @@ def main(): missing_ids = list(missing_ids) if missing_ids: # We check that the missing ids have no data yet + missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] + missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( + CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") + if len(missing_ids) != 8: raise Exception("Unacceptable number of missings") @@ -1316,5 +1334,37 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa # Save excel proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + +def find_remaining_surveys(): + """ + This compares a list of properties that have been surveyed against a list of properties that I have produced + costed retrofit packages for, so I know what needs to be downloaded from Sharepoint + :return: + """ + + surveyed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" + "/Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx", + header=4 + ) + + costed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages " + "20241030 (WIP) MR Review v1.xlsx", + header=13, + sheet_name="Modelled Packages" + ) + costed = costed[~pd.isnull(costed["Address ID"])] + + needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])] + + needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. Group Rank"].astype(str) + needed = needed.sort_values("id", ascending=True) + needed[["id", "Name", "Postcode"]].to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/needed_surveys.csv" + ) + + assert needed.shape[0] + costed.shape[0] == surveyed.shape[0] + # if __name__ == "__main__": # main() From dfa37f86d469d4ee926ee0dc2438629fb35e17cc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Nov 2024 15:49:28 +0000 Subject: [PATCH 2/2] Adding postcode summary to stonewater --- .../stonewater/Wave 3 Preparation.py | 79 +++++++++++++++---- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0036a0a4..889d8f88 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -916,13 +916,14 @@ def main(): "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', - '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX' + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - + # Handle the case that has the wrong postcode in the asset data if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() @@ -986,11 +987,11 @@ def main(): missing_ids = list(missing_ids) if missing_ids: # We check that the missing ids have no data yet - missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] - missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( - CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") + # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] + # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( + # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - if len(missing_ids) != 8: + if len(missing_ids) != 6: raise Exception("Unacceptable number of missings") if matching_lookup["Address ID"].duplicated().sum(): @@ -1083,12 +1084,20 @@ def main(): stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) windows_data["Address ID"] = windows_data["Address ID"].astype(float) stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") + stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True) if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") + for c in [ + 'Window attributes - Fitted/renewed date', + 'Parent Asset Window attributes - Fitted/renewed date', + 'Fitted/renewed date' + ]: + stonewater_data[c] = stonewater_data[c].astype(str) + # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False) cost_sheet = [ { @@ -1173,7 +1182,7 @@ def main(): create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1183,8 +1192,8 @@ def main(): def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): # We read in the costed packages - # Note: Header as 12 is for Matt Ratcliff's reviewed version costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages") + costed_packages = costed_packages[~pd.isnull(costed_packages["Address"])] archetypes_to_cost = costed_packages[ [ @@ -1213,16 +1222,11 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa 'Existing Primary Heating System', 'Existing Primary Heating PCDF Reference']) - # We take properties that are EPC D and below (61% of units) + # We take properties that are EPC D and below (59% of units) archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) - average_cost = archetypes_to_cost[ - archetypes_to_cost["Has been modelled"] - ]['Total Cost of Measures inc Contingency'].mean() - print(average_cost) - # These are the Arhetypes that will likely be suitable for Wave 3 archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] @@ -1236,7 +1240,21 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa how="left" ) - proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])] + proposed_sample = archetypes_sheet[ + archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + not_proposed = archetypes_sheet[ + ~archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + # archetypes_without_survey = [] + # for p in list(set(not_proposed)): + # filtered = costed_packages[costed_packages["Archetype ID"].astype(int).astype(str) == p] + # if filtered.empty: + # archetypes_without_survey.append(p) + + # Can we propose anything about archetypes that were not surveyed? proposed_sample = proposed_sample[ [ @@ -1247,6 +1265,8 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa # We classify into high and low confidence + archetypes_to_cost["Surveyed Main Roof"] = archetypes_to_cost["Surveyed Main Roof"].fillna("") + match_classification = [] for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): @@ -1331,8 +1351,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa None, proposed_sample["Total Cost of Measures inc Contingency"] ) + proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True) + # Save excel - proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx", index=False) + + # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out + proposed_sample_postcodes = proposed_sample["Postcode"].unique() + + postcode_summary = [] + for postcode in proposed_sample_postcodes: + in_proposal = proposed_sample[proposed_sample["Postcode"] == postcode] + not_in_proposal = not_proposed[not_proposed["Postcode"] == postcode] + postcode_summary.append( + { + "Postcode": postcode, + "Number of properties in Proposal": len(in_proposal), + "Number of properties not in Proposal": len(not_in_proposal) + } + ) + postcode_summary = pd.DataFrame(postcode_summary) + postcode_summary = postcode_summary.sort_values( + "Number of properties not in Proposal", + ascending=False).reset_index(drop=True) + + postcode_summary.to_excel( + CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx", index=False + ) def find_remaining_surveys():