From 11a4bc24a1903f4f384aef48fd006ca8c17c28e8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 17:42:19 +0000 Subject: [PATCH 01/72] anonymised sharepoint keys --- .../panacap_ventures/sample_remote_assessments.py | 1 + etl/customers/stonewater/Wave 3 Preparation.py | 1 - etl/customers/stonewater/data_cleaning.py | 15 ++++++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py new file mode 100644 index 00000000..1a5ddff7 --- /dev/null +++ b/etl/customers/panacap_ventures/sample_remote_assessments.py @@ -0,0 +1 @@ +# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8538188b..b1bf0638 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2936,7 +2936,6 @@ def revised_model(): missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() - assert # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 8751960c..7ee06fcd 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -1,6 +1,7 @@ import os import shutil from tqdm import tqdm +from etl.access_reporting.app import SharePointClient def delete_large_files(): @@ -66,13 +67,17 @@ def delete_large_files(): def download_data_from_sharepoint(): # Given a sharepoint location, this function will download the retrofit assessment folders from the locations # specified in the sharepoint location - from etl.access_reporting.app import SharePointClient + + SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) + SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) + OSMOSIS_SHAREPOINT_SITE_ID = os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None) sharepoint_client = SharePointClient( - tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf", - client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d", - client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ", - site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489" + tenant_id=SHAREPOINT_TENANT_ID, + client_id=SHAREPOINT_CLIENT_ID, + client_secret=SHAREPOINT_CLIENT_SECRET, + site_id=OSMOSIS_SHAREPOINT_SITE_ID ) # Retrieve the data from Sharepoint and write to local machine From 86deed8115c8b630ca5516f113ec5beb585460e0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 18:13:07 +0000 Subject: [PATCH 02/72] setting up the stonewater assessment extraction process --- .../sample_remote_assessments.py | 1 - .../stonewater/Wave 3 Preparation.py | 116 +++++++++++++++++- 2 files changed, 112 insertions(+), 5 deletions(-) delete mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py deleted file mode 100644 index 1a5ddff7..00000000 --- a/etl/customers/panacap_ventures/sample_remote_assessments.py +++ /dev/null @@ -1 +0,0 @@ -# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1bf0638..105628e9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2928,14 +2928,122 @@ def revised_model(): original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", ""] - ] - # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() + assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} + + original_archetypes = original_archetypes[ + ["Address ID", "Archetype ID", "Archetype Group Rank"] + ] + + # Merge these archetypes on to the new priority postcodes + new_priority_postcodes = new_priority_postcodes.merge( + original_archetypes, how="left", on="Address ID" + ) + + # Basic check, should have no rows with missing Archetype ID, where + assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin( + original_archetypes["Address ID"] + ).sum()) == 0 + + # We pull together the survey data sheet + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + wave_21_folders = [ + "1. Herefordshire", + "2. Bedfordshire", + "3. Wiltshire", + "4. Bournemouth", + "5. Coventry", + "6. West Sussex", + "7. Dorset", + "8. Cambridgeshire", + "9. Guildford", + "10. Little Island", + "11. CCS Dorset" + ] + + for wave_2_1_folder in wave_21_folders: + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + # We now do a large pull of all of the data + extracted_data = [] + for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + + # List the folders inside of the survey folder + survey_subfolders = [ + name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name)) + ] + + # Check if there's a "retrofit assessment" folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + + # If retrofit assessment folder exists, check if it has content + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + continue + else: + # Then we have an empty Retrofit Assessment folder + continue + + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + + retrofit_assessment_data = pd.DataFrame(extracted_data) + # TODO - Save this data + # if __name__ == "__main__": # main() From ca7a0e9d107c7da66fd7a8d5066834b7dbf00978 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 22:15:53 +0000 Subject: [PATCH 03/72] debugging extract epr for old elmhurst epr --- .../stonewater/Wave 3 Preparation.py | 29 +++++++++++++++++-- etl/route_march_data_pull/app.py | 18 +++++------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 105628e9..ee314f17 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -747,12 +747,30 @@ def extract_epr(pdf_path): # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) - current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) - data["Current SAP Rating"] = current_sap + if sap_match is None: + # Handles the older format of the elmhurst EPR + # The text will look something like this: + # Least energy efficient - higher running costsD 61 - we extract D 61 + sap_match = re.search( + r"(?P[A-G])\s(?P\d{1,3})(?P[A-G])\s(?P\d{1,3})", + text) + data["Current EPC Band"] = sap_match.group("current_epc") + data["Current SAP Rating"] = int(sap_match.group("current_sap")) + else: + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap # Extract the primary energy use intensity additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) - data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + if additional_rating_match: + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + else: + # Handles the older format of the Elmhurst EPR + primary_energy_match = re.search(r"actual consumption\.\n(?P\d+)", text) + data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy")) + # We calculate the primary energy use intensity by dividing by floor area + floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") + data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -2983,8 +3001,13 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + # Check that the survey folder is actually a folder + if not os.path.isdir(survey_folder_path): + continue + # List the folders inside of the survey folder survey_subfolders = [ name for name in os.listdir(survey_folder_path) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 8d19aa84..247ce98c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -162,19 +162,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern" - DATA_FILENAME = "January 2025 Additions Query.xlsx" - SHEET_NAME = "Jan 2025 additions" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing" + DATA_FILENAME = "For Housing Data pull.xlsx" + SHEET_NAME = "Sheet1" POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Street / Block Name" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "NO." + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560 - } + MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() From fd98721748c9da95c3660116f33b6aa00d1be01f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 15:24:02 +0000 Subject: [PATCH 04/72] debugging epr extraction when the dimensions are external --- etl/customers/stonewater/Wave 3 Preparation.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ee314f17..4db089e7 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -465,7 +465,11 @@ def extract_building_parts_summary(text): r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: - raise ValueError("Failed to locate dimensions section in the text.") + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") dimensions_text = dimensions_section.group(1) @@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file): """ # Attempt to read the first page of the PDF to determine type with open(pdf_path, "rb") as file: + # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter + # This is because the pdf is irregular. We could possibly try a library like fitz to handle this reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" + n_pages = len(reader.pages) - if is_energy_report(first_page_text): + if is_energy_report(first_page_text) and n_pages > 3: + # The EPR should have more than 3 pages return "epr" + elif is_energy_report(first_page_text) and n_pages <= 3: + # This is a shortened version of the EPR which isn't massively useful + return "short_form_epr" elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): return "summary" elif is_condition_report(first_page_text): From 231069f4e3e4ca2a40e114db0963c55aa56b09b7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:37:06 +0000 Subject: [PATCH 05/72] matching algorithm wip --- .../stonewater/Wave 3 Preparation.py | 275 +++++++++++++++++- 1 file changed, 274 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4db089e7..904afd30 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3077,7 +3077,280 @@ def revised_model(): extracted_data.append(summary_data) retrofit_assessment_data = pd.DataFrame(extracted_data) - # TODO - Save this data + + # Remove some definite duplicates + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + + retrofit_assessment_data = retrofit_assessment_data[ + ~retrofit_assessment_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop + ) + ] + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False + # ) + + # We can read in the data as needed + + # Next Step: Read in the coordinated measures and match to the extracted data + ############################################################ + # CCS + ############################################################# + ccs_coordination_sheet = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"), + header=4 + ) + ccs_coordination_sheet["contractor"] = "CCS" + # We split ccs into two sections - the first being + ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) + ccs_coordination_sheet = ccs_coordination_sheet.head(87) + ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + + ############################################################ + # WATES + ############################################################# + wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" + ), + header=4 + ) + wates_coordination_sheet["contractor"] = "Wates" + # Break into the different sites: + # Wiltshire + wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267) + wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :] + wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :] + wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :] + wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] + wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] + wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :] + + wates_coordination = pd.concat( + [ + wates_coordination_sheet_wiltshere, + wates_coordination_sheet_herefordshire, + wates_coordination_sheet_coventry, + wates_coordination_sheet_bedfordshire, + wates_coordination_sheet_bournemouth, + wates_coordination_sheet_cambridgeshire, + wates_coordination_sheet_removed_from_programme, + wates_coordination_sheet_abeyance + ] + ) + + # Combine the data back + + ############################################################ + # NEW 450 COORDINATED RETROFIT ASSESSMENTS + ############################################################# + + retrofit_packages_board = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" + ), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' + } + + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + + if filtered.empty: + continue + if filtered.shape[0] != 1: + raise Exception("something went wrong") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + matching_lookup = pd.DataFrame(matching_lookup) + + ccs_coordination = ccs_coordination.rename( + columns={"Post Code": "Postcode"} + ) + ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + from fuzzywuzzy import fuzz + + ccs_manual_filters = {} + ccs_matching_lookup = [] + for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). + str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"] + + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 9 + ) + + if to_filter.sum() == 0: + blah + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + ccs_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID.1": home["Asset ID.1"], + "Name": home["Name"] + } + ) + continue + + blah2 + + # home["Name"] should be contained in the survey_folder + # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # # We have an edge case wher some properties have two outputs in Sharepoint + # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + # raise Exception("Fix me1") + # # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + # + # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + # raise Exception("Fix me2") + # # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + # + # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + # filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + # + # if filtered.empty: + # continue + # if filtered.shape[0] != 1: + # raise Exception("something went wrong") + # + # matching_lookup.append( + # { + # "survey_folder": filtered["survey_folder"].values[0], + # "Address ID": home["Address ID"], + # "Name": home["Name"] + # } + # ) # if __name__ == "__main__": # main() From 7dd64781724df896badfd2170cba3ba5d2c283b9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:43:56 +0000 Subject: [PATCH 06/72] Added more logic for matching --- etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 904afd30..ab640496 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3273,7 +3273,7 @@ def revised_model(): ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): # Handle the case that has the wrong postcode in the asset data - if home["Name"] in manual_filters: + if home["Name"] in ccs_manual_filters: filtered = retrofit_assessment_data[ retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] ].copy() @@ -3297,13 +3297,16 @@ def revised_model(): ) ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"] - + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("") == home[ + "Name"] + if to_filter.sum() == 0: + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("") == home[ + "Name"] if to_filter.sum() == 0: # Do a fuzzy match on the name # Find the best filter - to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply( - lambda x: fuzz.partial_ratio(home["Name"], x) > 9 + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 ) if to_filter.sum() == 0: From 0331d82f6ac687b55297e80f430a15fa148f5d67 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:55:36 +0000 Subject: [PATCH 07/72] added manual match --- .../stonewater/Wave 3 Preparation.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ab640496..61344038 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3267,15 +3267,19 @@ def revised_model(): columns={"Post Code": "Postcode"} ) ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] from fuzzywuzzy import fuzz - ccs_manual_filters = {} + ccs_manual_filters = { + "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" + } ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + # Handle the case that has the wrong postcode in the asset data if home["Name"] in ccs_manual_filters: filtered = retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]] ].copy() else: filtered = retrofit_assessment_data[ @@ -3297,11 +3301,15 @@ def revised_model(): ) ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("") == home[ - "Name"] + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("") == home[ - "Name"] + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: # Do a fuzzy match on the name # Find the best filter From 678a4b52d28194d1dcf7c2d86d3993dde0161f3f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:03:11 +0000 Subject: [PATCH 08/72] matching for all of ccs --- etl/customers/stonewater/Wave 3 Preparation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 61344038..fa548f0d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3316,6 +3316,19 @@ def revised_model(): to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( lambda x: fuzz.partial_ratio(home["Name"], x) > 93 ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: blah From 7291f7128e6b5403132e5afdcc56330ea3d71f15 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:11:29 +0000 Subject: [PATCH 09/72] started wates matching --- .../stonewater/Wave 3 Preparation.py | 119 +++++++++++++----- 1 file changed, 91 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fa548f0d..cbbf04c6 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3331,7 +3331,7 @@ def revised_model(): ) if to_filter.sum() == 0: - blah + raise Exception("Error") filtered = filtered[to_filter] if filtered.empty: @@ -3347,34 +3347,97 @@ def revised_model(): ) continue - blah2 + raise Exception("No match") - # home["Name"] should be contained in the survey_folder - # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] - # # We have an edge case wher some properties have two outputs in Sharepoint - # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": - # raise Exception("Fix me1") - # # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] - # - # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - # raise Exception("Fix me2") - # # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] - # - # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': - # filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] - # - # if filtered.empty: - # continue - # if filtered.shape[0] != 1: - # raise Exception("something went wrong") - # - # matching_lookup.append( - # { - # "survey_folder": filtered["survey_folder"].values[0], - # "Address ID": home["Address ID"], - # "Name": home["Name"] - # } - # ) + ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup) + # We get a match for all records + assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] + assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + + # We do the same for Wates + wates_coordination = wates_coordination.rename( + columns={"Post Code": "Postcode"} + ) + wates_coordination = wates_coordination[ + wates_coordination["Retrofit Assessment"].isin(["Completed"]) + ] + + wates_manual_filters = {} + wates_matching_lookup = [] + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in wates_manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). + str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 + ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") # if __name__ == "__main__": # main() From b1936521f6f3c3585057d5f2ce10d1998e558400 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:16:18 +0000 Subject: [PATCH 10/72] added manual match --- etl/customers/stonewater/Wave 3 Preparation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index cbbf04c6..8a00604b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3362,7 +3362,9 @@ def revised_model(): wates_coordination["Retrofit Assessment"].isin(["Completed"]) ] - wates_manual_filters = {} + wates_manual_filters = { + "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View" + } wates_matching_lookup = [] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): From 1814d7b6709cd7861db5c15ac6821a601708882e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:21:08 +0000 Subject: [PATCH 11/72] 11% through matching --- etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8a00604b..7cbf04f1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3382,6 +3382,13 @@ def revised_model(): to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False ) + + if to_filter.sum() > 1: + to_filter = ( + filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() == + home["Name"].replace(r"[^\w\s]", "").lstrip().lower() + ) + if to_filter.sum() == 0: to_filter = ( filtered["survey_folder"]. From b4296db52d7b3c3e26ce3869ac31753bd731c379 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 00:51:39 +0000 Subject: [PATCH 12/72] adding quidos extraction functions --- .../stonewater/Wave 3 Preparation.py | 7 ++ survey_report/app.py | 44 +++++++++ .../extraction/detect_report_type.py | 19 ++++ survey_report/extraction/quidos.py | 99 +++++++++++++++++++ survey_report/requirements.txt | 0 5 files changed, 169 insertions(+) create mode 100644 survey_report/app.py create mode 100644 survey_report/extraction/detect_report_type.py create mode 100644 survey_report/extraction/quidos.py create mode 100644 survey_report/requirements.txt diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7cbf04f1..70c531c0 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3366,8 +3366,15 @@ def revised_model(): "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View" } wates_matching_lookup = [] + # Examples to skip when we cannot get the data + wates_to_skip = [ + "66 Abbatt Close", # File type is unusual, couldn't extract the data + ] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + if home["Name"] in wates_to_skip: + continue + # Handle the case that has the wrong postcode in the asset data if home["Name"] in wates_manual_filters: filtered = retrofit_assessment_data[ diff --git a/survey_report/app.py b/survey_report/app.py new file mode 100644 index 00000000..825a3658 --- /dev/null +++ b/survey_report/app.py @@ -0,0 +1,44 @@ +import os +import PyPDF2 +from survey_report.extraction.detect_report_type import detect_report_type +from survey_report.extraction.quidos import SiteNotesExtractor + + +def handle(): + """ + Performs the data extraction process for the survey report + :return: + """ + + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2" + + folder_contents = os.listdir(data_folder) + # We look for the following files: + # Site notes + file_mapping = {} + for file in folder_contents: + # Check if it's a pdf file + if not file.endswith(".pdf"): + continue + filepath = os.path.join(data_folder, file) + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() + + # Check the report type + report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[report_type] = text + + # Check the report type + report_type = detect_report_type(os.path.join(data_folder, file)) + + # This is only set up to work with quido site notes so we must have it + if "quidos_site_notes" not in file_mapping: + raise ValueError("No quidos site notes found") + + site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes = site_notes_extractor.extract_all() diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py new file mode 100644 index 00000000..fe1600e7 --- /dev/null +++ b/survey_report/extraction/detect_report_type.py @@ -0,0 +1,19 @@ +import re + + +def detect_report_type(first_page): + """ + Detects the type of report based on the first page of the report + :param first_page: + :return: + """ + # Set up for the minute to handle quidos files. We have the Elmhurst logic so we can introduce + # this when we need + + if re.match( + r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator", + first_page + ): + return "quidos_site_notes" + + return None diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py new file mode 100644 index 00000000..f11ffcb1 --- /dev/null +++ b/survey_report/extraction/quidos.py @@ -0,0 +1,99 @@ +import re + + +class SiteNotesExtractor: + """ + Extracts SAP rating, carbon emissions, and building dimensions from an EPC summary report. + """ + + def __init__(self, pdf_text): + """ + Initializes the SiteNotesExtractor with the extracted PDF text. + """ + self.text = pdf_text + self.data = {} + + def extract_sap_rating(self): + """ + Extracts the current and potential SAP rating from the report. + """ + pattern = re.search(r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text) + + if not pattern: + raise ValueError("No SAP rating found in the report") + + self.data.update({ + "Current EPC Band": pattern.group(1), + "Current SAP Rating": int(pattern.group(2)), + "Potential EPC Band": pattern.group(3), + "Potential SAP Rating": int(pattern.group(4)), + }) + + def extract_carbon_emissions(self): + """ + Extracts the current and adjusted annual carbon emissions (TCO2). + """ + pattern = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text) + + if not pattern: + raise ValueError("No carbon emissions found in the report") + + self.data.update({ + "Current Carbon Emissions (TCO2)": float(pattern.group(1)), + }) + + def extract_building_dimensions(self): + """ + Extracts dimensions for each building part and stores them in a list. + Handles Main Property and multiple extensions. + """ + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) " + r"Party Wall " + r"Length \(m\)\n" + r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL + ) + + if not dimensions_section: + raise ValueError("Failed to locate the dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.) + building_part_pattern = re.compile( + r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + building_parts = [] + for match in building_part_pattern.finditer(dimensions_text): + to_append = { + "Building Part": match.group(1).strip(), + "Part Floor Area (m2)": float(match.group(2)), + "Room Height (m)": float(match.group(3)), + "Loss Perimeter (m)": float(match.group(4)), + "Party Wall Length (m)": float(match.group(5)), + } + # We calculate the heat loss area + to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"] + building_parts.append(to_append) + + if not building_parts: + raise ValueError("No building dimensions found in the report") + + self.data["Building Dimensions"] = building_parts + # We calculate some totals + self.data["Total Building Dimensions"] = { + "floor_area": sum([part["Part Floor Area (m2)"] for part in building_parts]), + "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]), + } + + def extract_all(self): + """ + Runs all extraction methods and returns a dictionary with extracted data. + """ + self.extract_sap_rating() + self.extract_carbon_emissions() + self.extract_building_dimensions() + return self.data diff --git a/survey_report/requirements.txt b/survey_report/requirements.txt new file mode 100644 index 00000000..e69de29b From 32b053e7db3b08445b1649d6c418f33c5b235647 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 00:54:56 +0000 Subject: [PATCH 13/72] extracting bills --- survey_report/extraction/quidos.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index f11ffcb1..ae66dd0d 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -89,11 +89,23 @@ class SiteNotesExtractor: "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]), } + def extract_bills_estimate(self): + """ + Extracts the estimated annual energy costs (£) from the report. + """ + pattern = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text) + + if not pattern: + raise ValueError("No bills estimate found in the report") + + self.data["Estimated Annual Energy Cost (£)"] = float(pattern.group(1).replace(",", "")) + def extract_all(self): """ Runs all extraction methods and returns a dictionary with extracted data. """ self.extract_sap_rating() self.extract_carbon_emissions() + self.extract_bills_estimate() self.extract_building_dimensions() return self.data From daabf2a586eec7bf31440696f014ad7035a0033e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 01:09:41 +0000 Subject: [PATCH 14/72] extracting epr --- survey_report/app.py | 20 ++++--- .../extraction/detect_report_type.py | 3 + survey_report/extraction/quidos.py | 55 +++++++++++++++++++ 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/survey_report/app.py b/survey_report/app.py index 825a3658..f59c9984 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,7 +1,7 @@ import os import PyPDF2 from survey_report.extraction.detect_report_type import detect_report_type -from survey_report.extraction.quidos import SiteNotesExtractor +from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor def handle(): @@ -33,12 +33,18 @@ def handle(): if report_type is not None: file_mapping[report_type] = text - # Check the report type - report_type = detect_report_type(os.path.join(data_folder, file)) - # This is only set up to work with quido site notes so we must have it - if "quidos_site_notes" not in file_mapping: - raise ValueError("No quidos site notes found") - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) site_notes = site_notes_extractor.extract_all() + + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr = epr_extractor.extract_all() + + # We now produce the combined data sheet which is the starting figure: + data_sheet = {**epr, **site_notes} + del data_sheet['Building Dimensions'] + # We unnest the Total Building Dimensions + data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + del data_sheet["Total Building Dimensions"] diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py index fe1600e7..434a3fb4 100644 --- a/survey_report/extraction/detect_report_type.py +++ b/survey_report/extraction/detect_report_type.py @@ -16,4 +16,7 @@ def detect_report_type(first_page): ): return "quidos_site_notes" + if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page): + return "quidos_epr" + return None diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index ae66dd0d..374df084 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -109,3 +109,58 @@ class SiteNotesExtractor: self.extract_bills_estimate() self.extract_building_dimensions() return self.data + + +class EPRExtractor: + """ + Extracts space heating, water heating, and address from an Energy Performance Report (EPR). + """ + + def __init__(self, pdf_text): + """ + Initializes the EPRExtractor with the extracted PDF text. + """ + self.text = pdf_text + self.data = {} + + def extract_heating_data(self): + """ + Extracts space heating and water heating values from the report. + """ + pattern = re.search( + r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)", + self.text, + re.DOTALL + ) + + if not pattern: + raise ValueError("No heating data found in the report") + + self.data.update({ + "Space Heating (KWH)": int(pattern.group(1).replace(",", "")), + "Water Heating (KWH)": int(pattern.group(2).replace(",", "")) + }) + + def extract_address(self): + """ + Extracts the full address from the report. + """ + pattern = re.search( + r"Address\s*(.*?)\nTown\s*(.*?)\n", + self.text, + re.DOTALL + ) + + if not pattern: + raise ValueError("No address found in the report") + + full_address = pattern.group(1).strip() + self.data["Address"] = full_address + + def extract_all(self): + """ + Runs all extraction methods and returns a dictionary with extracted data. + """ + self.extract_address() + self.extract_heating_data() + return self.data From f6d8688698bfcdc1c9d1230b9040dfe071e2bf1e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 17:30:17 +0000 Subject: [PATCH 15/72] completed matching --- .../stonewater/Wave 3 Preparation.py | 89 +++++++++++-- etl/customers/stonewater/data_cleaning.py | 5 +- survey_report/app.py | 41 ++++++ survey_report/template.html | 123 ++++++++++++++++++ 4 files changed, 248 insertions(+), 10 deletions(-) create mode 100644 survey_report/template.html diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 70c531c0..d9b5c41d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3078,6 +3078,13 @@ def revised_model(): retrofit_assessment_data = pd.DataFrame(extracted_data) + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False + # ) + retrofit_assessment_data = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), + ) + # Remove some definite duplicates dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] @@ -3097,10 +3104,6 @@ def revised_model(): # Replace \n with "" retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") - # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False - # ) - # We can read in the data as needed # Next Step: Read in the coordinated measures and match to the extracted data @@ -3108,24 +3111,59 @@ def revised_model(): # CCS ############################################################# ccs_coordination_sheet = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"), + os.path.join( + CUSTOMER_FOLDER_PATH, + "Jan 2025 Project", + "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx" + ), header=4 ) + ccs_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx" + ), + header=4 + ) + ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 'Asset ID.1']].merge( + ccs_coordination_sheet, how="left", on="Name" + ) + ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])] ccs_coordination_sheet["contractor"] = "CCS" # We split ccs into two sections - the first being ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) ccs_coordination_sheet = ccs_coordination_sheet.head(87) ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + from urllib import parse + def extract_sharepoint_url(x): + if pd.isnull(x): + return "" + return "/".join(parse.urlparse( + x.split(" - http")[1] + ).path.replace("%20", " ").split("/")[-2:]) + + ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) + ############################################################ # WATES ############################################################# wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx" + ), + header=4 + ) + wates_postcodes = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" ), header=4 ) + wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])] + wates_coordination_sheet = wates_coordination_sheet.merge( + wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name" + ) + wates_coordination_sheet["contractor"] = "Wates" # Break into the different sites: # Wiltshire @@ -3136,7 +3174,7 @@ def revised_model(): wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :] - wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :] wates_coordination = pd.concat( [ @@ -3151,12 +3189,15 @@ def revised_model(): ] ) + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( + lambda x: extract_sharepoint_url(x) + ) + # Combine the data back ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# - retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, @@ -3361,17 +3402,49 @@ def revised_model(): wates_coordination = wates_coordination[ wates_coordination["Retrofit Assessment"].isin(["Completed"]) ] + wates_coordination = wates_coordination[ + ~pd.isnull(wates_coordination["Postcode"]) + ] wates_manual_filters = { - "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View" + "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1', + '2 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 2', } wates_matching_lookup = [] # Examples to skip when we cannot get the data wates_to_skip = [ "66 Abbatt Close", # File type is unusual, couldn't extract the data + "Flat 69 Goddard Road", # Doesn't exist + "19 Garth House", # # File type is unusual, couldn't extract the data + '5 Gilpin Close', # No properly formatted EPR + '49 The Hide, Netherfield', # TODO: TEMP HERE + '19 Chanders Rd', + '5 Chanders Rd', + '23 Chanders Rd', + '3 Chanders Rd', + '1 Orchard Close', ] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + # Search the folder + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False) + ] + if len(filtered) == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + if home["Name"] in wates_to_skip: continue diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 7ee06fcd..010902ce 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -86,7 +86,6 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) - len(contents["value"]) folders_to_pull = [ folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"] ] @@ -108,6 +107,8 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" + folder_to_pull["name"] + "/" + property_folder["name"] ) + if not property_folder_contents.get("value"): + continue # We look for the retrofit assessment folder: property_sub_folders = [ f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() @@ -138,5 +139,5 @@ def download_data_from_sharepoint(): drive_id=sharepoint_client.document_drive["id"], folder_path=property_folder_path, download_dir=download_dir, - excluded_file_types=["MOV"] + excluded_file_types=["MOV", "jpg"] ) diff --git a/survey_report/app.py b/survey_report/app.py index f59c9984..87ce7864 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,9 +1,33 @@ import os import PyPDF2 +from string import Template from survey_report.extraction.detect_report_type import detect_report_type from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor +def generate_html_report(template_path, output_path, data): + """ + Reads an HTML template file, injects dynamic values, and generates a final HTML report. + + Args: + - template_path (str): Path to the HTML template file. + - output_path (str): Path to save the generated HTML file. + - data (dict): Dictionary containing dynamic values for the report. + """ + # Read the template file + with open(template_path, "r", encoding="utf-8") as f: + html_template = Template(f.read()) # Use Template from string module + + # Replace placeholders with actual data + final_html = html_template.safe_substitute(data) # Use safe_substitute to prevent missing key errors + + # Save the generated HTML file + with open(output_path, "w", encoding="utf-8") as f: + f.write(final_html) + + print(f"HTML report generated successfully: {output_path}") + + def handle(): """ Performs the data extraction process for the survey report @@ -48,3 +72,20 @@ def handle(): data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] del data_sheet["Total Building Dimensions"] + + # Generate the HTML report + # Placeholder locations + template_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/template.html" + output_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/output/report.html" + logo_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/assets/logo.png" + generate_html_report( + template_path, output_path, + data={ + "address": data_sheet["Address"], + "logo_path": logo_path, + "current_epc": data_sheet["Current EPC Band"], + "current_sap": data_sheet["Current SAP Rating"], + "potential_epc": "A", # TODO PLACEHOLDER + "potential_sap": 91, # TODO PLACEHOLDER + } + ) diff --git a/survey_report/template.html b/survey_report/template.html new file mode 100644 index 00000000..5d3b6c63 --- /dev/null +++ b/survey_report/template.html @@ -0,0 +1,123 @@ + + + + + + Domna Energy Report + + + + +
+ +
+
+

Domna Energy Report

+

${address}

+
+ +
+ + +
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+ +
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+ +
+ + + From 01a5077c17cd219ddc907c48eaae4158c9117cfb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 12:54:57 +0000 Subject: [PATCH 16/72] tidying up stonewater work --- .../stonewater/Wave 3 Preparation.py | 224 +++++++++++++++++- 1 file changed, 221 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d9b5c41d..5c4da35b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,4 +1,6 @@ import os +from pyexpat import features + import PyPDF2 import re import pandas as pd @@ -1704,7 +1706,6 @@ def append_stonewater_id(): ) model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) - z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values() original_archetypes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " @@ -2942,7 +2943,6 @@ def revised_model(): """ # 1) Create the new list of properties - new_priority_postcodes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 " "priority list.xlsx" @@ -3188,7 +3188,13 @@ def revised_model(): wates_coordination_sheet_abeyance ] ) - + # We correct the Asset ID for 34 Kempster Close + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == "34 Kempster Close", + "12005", + wates_coordination["Asset ID"] + ) + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( lambda x: extract_sharepoint_url(x) ) @@ -3198,6 +3204,14 @@ def revised_model(): ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + features["Address ID"] = features["Address ID"].astype(str).astype(int) + features_to_merge = features[["Address ID", "Organisation Reference"]] + retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, @@ -3211,6 +3225,10 @@ def revised_model(): retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] + retrofit_packages_board = retrofit_packages_board.merge( + features_to_merge, how="left", on="Address ID" + ) + manual_filters = { "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", @@ -3527,6 +3545,206 @@ def revised_model(): continue raise Exception("No match") + wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + + # Merge lookup tables onto the coordination sheets + wates_coordination = wates_coordination.merge( + wates_matching_lookup, how="left", on="Name" + ) + missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] + if not missed_asset_id.empty: + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID_x"] + ) + + ccs_coordination = ccs_coordination.merge( + ccs_matching_lookup, how="left", on="Name" + ) + + retrofit_packages_board = retrofit_packages_board.merge( + matching_lookup, how="left", on="Name" + ) + + # We combine this into a singular board + coordinated_packages = pd.concat( + [ + retrofit_packages_board[ + [ + "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating', + 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Organisation Reference', + ] + ], + ccs_coordination[ + [ + # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, + # Solar PV + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", + ] + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID.1_y': 'Organisation Reference', + } + ), + wates_coordination[ + [ + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' + + ] + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID_x': 'Organisation Reference', + } + ) + ] + ) + + coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) + + # Merge the property features on + coordinated_packages = coordinated_packages.merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + how="left", + on="Organisation Reference" + ) + + # We need the features pertaining to these priority postcodes + + def find_nearest_matching_property(coordinated_packages, home): + filter_levels = [ + ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + ] + + for i, filters in enumerate(filter_levels): + match = coordinated_packages.copy() + + for col in filters: + match = match[match[col] == home[col]] + + if not match.empty: + return match + + return None # No match found + + coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() + new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() + + coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip() + new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip() + + coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] + new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] + + # For every property in the priority postcodes data, we look for a most appropriate matching property + no_match = [] + matches = [] + for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): + closest_match = find_nearest_matching_property(coordinated_packages, home) + if closest_match is None: + no_match.append(home["Organisation Reference"]) + continue + + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m + } for m in closest_match["Organisation Reference"].values + ] + matches.extend(to_extend) + + no_match_summary = new_priority_postcodes[ + new_priority_postcodes["Organisation Reference"].isin( + no_match + ) + ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[ + "Organisation Reference"].count().reset_index() + + no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) + + # len(no_match) + # 8764, 5607 + # no_match_summary.shape + # (3953, 6), (2948, 6) + + # We match the properties to their closest match + + matches_df = pd.DataFrame(matches) + matches_df = matches_df.merge( + coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]], + left_on="Best Match Organisation Reference", right_on="Organisation Reference", + suffixes=("", " - Closest Match") + ) + # We want to aggregate the matches, when we have multiple + aggregated_matches_df = [] + for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + if mapped_matches.shape[0] == 1: + mapped_matches["Number of matches"] = 1 + mapped_matches["Proportion"] + aggregated_matches_df.append(mapped_matches) + continue + + mapped_priority_list = new_priority_postcodes.merge( + matches_df, on="Organisation Reference", + ) + # We merge on the EPC ratings for the matched properties + mapped_priority_list = mapped_priority_list.merge( + + ) # if __name__ == "__main__": # main() From 04eba60961b0ea215701b2b35feaed74f9a5ef11 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 13:04:10 +0000 Subject: [PATCH 17/72] fixing cleaning for stonewater --- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5c4da35b..04078e47 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3194,7 +3194,32 @@ def revised_model(): "12005", wates_coordination["Asset ID"] ) - + + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID_x"] + ) + + wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( lambda x: extract_sharepoint_url(x) ) @@ -3412,6 +3437,7 @@ def revised_model(): # We get a match for all records assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum() # We do the same for Wates wates_coordination = wates_coordination.rename( @@ -3447,6 +3473,8 @@ def revised_model(): '3 Chanders Rd', '1 Orchard Close', ] + wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)] + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): # Search the folder @@ -3547,34 +3575,18 @@ def revised_model(): raise Exception("No match") wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + # We get a match for all records + assert wates_matching_lookup.shape[0] == wates_coordination.shape[0] + assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum() + assert not wates_matching_lookup["Asset ID"].duplicated().sum() + # Merge lookup tables onto the coordination sheets wates_coordination = wates_coordination.merge( wates_matching_lookup, how="left", on="Name" ) missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] if not missed_asset_id.empty: - # We fill the missing ids - missing_lookup = { - "4 Sydnall Fields": 31231, - "12 Sydnall Fields": 31239, - "12 Athena Gardens": 28061, - "49 Banner Lane": 41189, - "4 Jonathan Road": 41232, - "8 Jonathan Road": 41236, - "1 Jonathan Road": 41229, - "96 Taunton Way": 31417, - "94 Taunton Way": 31418, - "1 Lady Lane": 29430, - "10 Jonathan Road": 41283, - "21 Jonathan Road": 41246, - "12 Ashcroft Close": 26399 - } - for name, asset_id in missing_lookup.items(): - wates_coordination["Asset ID_x"] = np.where( - wates_coordination["Name"] == name, - asset_id, - wates_coordination["Asset ID_x"] - ) + raise Exception("Missing Asset ID") ccs_coordination = ccs_coordination.merge( ccs_matching_lookup, how="left", on="Name" From 10bc433283417a2c15ffe2924537ded81af240d6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 16:06:47 +0000 Subject: [PATCH 18/72] assigning properties to bands --- .../stonewater/Wave 3 Preparation.py | 71 ++++++++++++++++--- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 04078e47..c623e9f7 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3212,10 +3212,10 @@ def revised_model(): "12 Ashcroft Close": 26399 } for name, asset_id in missing_lookup.items(): - wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Asset ID"] = np.where( wates_coordination["Name"] == name, asset_id, - wates_coordination["Asset ID_x"] + wates_coordination["Asset ID"] ) wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] @@ -3596,6 +3596,16 @@ def revised_model(): matching_lookup, how="left", on="Name" ) + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board + to_remove = wates_coordination[ + wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + assert to_remove.shape[0] == 4 + # Remove them from the wates board + wates_coordination = wates_coordination[ + ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + # We combine this into a singular board coordinated_packages = pd.concat( [ @@ -3662,6 +3672,7 @@ def revised_model(): ) coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) + assert not coordinated_packages["Organisation Reference"].duplicated().sum() # Merge the property features on coordinated_packages = coordinated_packages.merge( @@ -3670,6 +3681,25 @@ def revised_model(): on="Organisation Reference" ) + # We match the properties to their closest match + # We clean up the SAP ratings in the coordinated packages + def sap_to_number(x): + try: + return int(x) + except: + if x[-1] in ["A", "B", "C", "D", "E", "F"]: + return int(x[:-1]) + + if x[0] in ["A", "B", "C", "D", "E", "F"]: + return int(x[1:]) + + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])] + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])] + + coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply( + lambda x: sap_to_number(x) + ) + # We need the features pertaining to these priority postcodes def find_nearest_matching_property(coordinated_packages, home): @@ -3729,11 +3759,9 @@ def revised_model(): no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) # len(no_match) - # 8764, 5607 + # 8764, 5607, 5646 # no_match_summary.shape - # (3953, 6), (2948, 6) - - # We match the properties to their closest match + # (3953, 6), (2948, 6), (2969, 7) matches_df = pd.DataFrame(matches) matches_df = matches_df.merge( @@ -3745,11 +3773,36 @@ def revised_model(): aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): if mapped_matches.shape[0] == 1: - mapped_matches["Number of matches"] = 1 - mapped_matches["Proportion"] - aggregated_matches_df.append(mapped_matches) + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": 1, + "Proportion": 100, + "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0], + "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0]) + } + ) continue + # We need to aggregate the matches, since we have multiple + average_rating = mapped_matches["Actual SAP Rating"].mean() + number_of_matches = mapped_matches.shape[0] + average_epc_rating = sap_to_epc(average_rating) + # proportion is the number of properties that have this EPC rating + proportion_with_this_epc = int( + mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100) + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": number_of_matches, + "Proportion": proportion_with_this_epc, + "Estimated SAP Rating": average_rating, + "Estimated EPC Rating": average_epc_rating + } + ) + + aggregated_matches_df = pd.DataFrame(aggregated_matches_df) + mapped_priority_list = new_priority_postcodes.merge( matches_df, on="Organisation Reference", ) From 139db23592ea885af14d8734d9cf2e36a1484a59 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Feb 2025 14:04:20 +0000 Subject: [PATCH 19/72] putting together outputs --- .../stonewater/Wave 3 Preparation.py | 346 +++++++++++++++--- etl/route_march_data_pull/app.py | 16 +- survey_report/app.py | 79 ++-- 3 files changed, 360 insertions(+), 81 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c623e9f7..1748f624 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,5 +1,6 @@ import os -from pyexpat import features +from urllib import parse +from fuzzywuzzy import fuzz import PyPDF2 import re @@ -2936,6 +2937,14 @@ def identify_incorrect_packages(): ) +def extract_sharepoint_url(x): + if pd.isnull(x): + return "" + return "/".join(parse.urlparse( + x.split(" - http")[1] + ).path.replace("%20", " ").split("/")[-2:]) + + def revised_model(): """ This function implements the revised model for Stonewater, where we are looking at new priority postcodes @@ -2956,6 +2965,7 @@ def revised_model(): original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) # Check if we have all of the addresses missed = original_archetypes[ @@ -2965,7 +2975,7 @@ def revised_model(): assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", "Archetype Group Rank"] + ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"] ] # Merge these archetypes on to the new priority postcodes @@ -3104,6 +3114,42 @@ def revised_model(): # Replace \n with "" retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + retrofit_assessments_data_columns = [ + 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', + 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys', + 'Fuel Bill', 'Window Age Description', + 'Window Age Description Proportion (%)', + 'Secondary Window Age Description', + 'Secondary Window Age Description Proportion (%)', 'Number of Windows', + 'Total Number of Doors', 'Number of Insulated Doors', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference', + 'Existing Primary Heating Controls', + 'Existing Primary Heating % of Heat', + 'Existing Secondary Heating System', + 'Existing Secondary Heating PCDF Reference', + 'Existing Secondary Heating Controls', + 'Existing Secondary Heating % of Heat', 'Secondary Heating Code', + 'Water Heating Code', 'Total Floor Area (m2)', + 'Total Ground Floor Area (m2)', 'RIR Floor Area', + 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)', + 'Number of Light Fittings', 'Number of LEL Fittings', + 'Number of fittings needing LEL', 'Main Roof Type', + 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', + 'Main Wall Thickness', 'Main Building Alternative Wall Type', + 'Main Building Alternative Wall Insulation', + 'Main Building Alternative Wall Dry-lining', + 'Main Building Alternative Wall Thickness', 'Main Fuel' + ] + # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: + retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] + rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed)) + retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict) + retrofit_assessment_data["Survey: Current EPC Band"] = ( + retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x)) + ) + # We can read in the data as needed # Next Step: Read in the coordinated measures and match to the extracted data @@ -3134,14 +3180,6 @@ def revised_model(): ccs_coordination_sheet = ccs_coordination_sheet.head(87) ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) - from urllib import parse - def extract_sharepoint_url(x): - if pd.isnull(x): - return "" - return "/".join(parse.urlparse( - x.split(" - http")[1] - ).path.replace("%20", " ").split("/")[-2:]) - ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) ############################################################ @@ -3224,8 +3262,6 @@ def revised_model(): lambda x: extract_sharepoint_url(x) ) - # Combine the data back - ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# @@ -3352,7 +3388,6 @@ def revised_model(): ) ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] - from fuzzywuzzy import fuzz ccs_manual_filters = { "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" @@ -3596,6 +3631,17 @@ def revised_model(): matching_lookup, how="left", on="Name" ) + # We now map the retrofit assessment data to the coordinated packages + wates_coordination = wates_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + retrofit_packages_board = retrofit_packages_board.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board to_remove = wates_coordination[ wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) @@ -3617,8 +3663,8 @@ def revised_model(): 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', 'Other measures', 'Organisation Reference', - ] - ], + ] + retrofit_assessments_data_columns_prefixed + ], ccs_coordination[ [ # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, @@ -3627,8 +3673,8 @@ def revised_model(): 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", - ] - ].rename( + ] + retrofit_assessments_data_columns_prefixed + ].rename( columns={ "SAP Band Pre": "Actual SAP Band", "SAP Rating Pre": "Actual SAP Rating", @@ -3651,8 +3697,8 @@ def revised_model(): 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' - ] - ].rename( + ] + retrofit_assessments_data_columns_prefixed + ].rename( columns={ "SAP Band Pre": "Actual SAP Band", "SAP Rating Pre": "Actual SAP Rating", @@ -3681,24 +3727,8 @@ def revised_model(): on="Organisation Reference" ) - # We match the properties to their closest match - # We clean up the SAP ratings in the coordinated packages - def sap_to_number(x): - try: - return int(x) - except: - if x[-1] in ["A", "B", "C", "D", "E", "F"]: - return int(x[:-1]) - - if x[0] in ["A", "B", "C", "D", "E", "F"]: - return int(x[1:]) - - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])] - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])] - - coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply( - lambda x: sap_to_number(x) - ) + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])] + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])] # We need the features pertaining to these priority postcodes @@ -3721,6 +3751,11 @@ def revised_model(): if not match.empty: return match + # Finally, we search for a property in the same Archetype + match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] + if not match.empty: + return match + return None # No match found coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() @@ -3732,6 +3767,12 @@ def revised_model(): coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] + coordinated_packages = coordinated_packages.merge( + new_priority_postcodes[["Organisation Reference", "Archetype ID"]], + how="left", + on="Organisation Reference" + ) + # For every property in the priority postcodes data, we look for a most appropriate matching property no_match = [] matches = [] @@ -3759,16 +3800,17 @@ def revised_model(): no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) # len(no_match) - # 8764, 5607, 5646 + # 8764, 5607, 5646, 5071 # no_match_summary.shape - # (3953, 6), (2948, 6), (2969, 7) + # (3953, 6), (2948, 6), (2969, 7), (2575, 7) matches_df = pd.DataFrame(matches) matches_df = matches_df.merge( - coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]], + coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], left_on="Best Match Organisation Reference", right_on="Organisation Reference", suffixes=("", " - Closest Match") ) + # We want to aggregate the matches, when we have multiple aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): @@ -3778,19 +3820,21 @@ def revised_model(): "Organisation Reference": org_ref, "Number of matches": 1, "Proportion": 100, - "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0], - "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0]) + "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0] } ) continue # We need to aggregate the matches, since we have multiple - average_rating = mapped_matches["Actual SAP Rating"].mean() + average_rating = mapped_matches["Survey: Current SAP Rating"].mean() number_of_matches = mapped_matches.shape[0] average_epc_rating = sap_to_epc(average_rating) # proportion is the number of properties that have this EPC rating proportion_with_this_epc = int( - mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100) + mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ + 0] / number_of_matches * 100 + ) aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3804,12 +3848,220 @@ def revised_model(): aggregated_matches_df = pd.DataFrame(aggregated_matches_df) mapped_priority_list = new_priority_postcodes.merge( - matches_df, on="Organisation Reference", + aggregated_matches_df, on="Organisation Reference", how="left" ) - # We merge on the EPC ratings for the matched properties - mapped_priority_list = mapped_priority_list.merge( + mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0] + + # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0 + + def remove_leading_zero(address): + return re.sub(r"^0([1-9]) ", r"\1 ", address) + + # Example usage + mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37004, + "8 Mason Road", + mapped_priority_list["address1"] ) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37003, + "9 Mason Road", + mapped_priority_list["address1"] + ) + + mapped_priority_list = mapped_priority_list.rename( + columns={"UPRN": "uprn"} + ) + mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + + # Let's get the newest EPC data for these properties + # We merge on UPRN, when we have it + # from etl.route_march_data_pull.app import get_data + # epc_data, errors, nodata = get_data( + # asset_list=mapped_priority_list, + # fulladdress_column="Address", + # address1_column="address1", + # postcode_column="Postcode", + # manual_uprn_map={}, + # epc_api_only=True + # ) + # + # epc_df = pd.DataFrame(epc_data) + # epc_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False + # ) + epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv")) + epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"}) + + # We now package up the data + + # Sheet 1 is the base coordination data + output_coordination_sheet = coordinated_packages[ + [ + "Name", "Postcode", 'Organisation Reference', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band', + 'Survey: Primary Energy Use (kWh/yr)', + 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', + 'Survey: Number of Storeys', 'Survey: Fuel Bill', + 'Survey: Window Age Description', + 'Survey: Window Age Description Proportion (%)', + 'Survey: Secondary Window Age Description', + 'Survey: Secondary Window Age Description Proportion (%)', + 'Survey: Number of Windows', 'Survey: Total Number of Doors', + 'Survey: Number of Insulated Doors', + 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating PCDF Reference', + 'Survey: Existing Primary Heating Controls', + 'Survey: Existing Primary Heating % of Heat', + 'Survey: Existing Secondary Heating System', + 'Survey: Existing Secondary Heating PCDF Reference', + 'Survey: Existing Secondary Heating Controls', + 'Survey: Existing Secondary Heating % of Heat', + 'Survey: Secondary Heating Code', 'Survey: Water Heating Code', + 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)', + 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)', + 'Survey: First Extension Wall Area (m2)', + 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings', + 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type', + 'Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type', + 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining', + 'Survey: Main Wall Thickness', + 'Survey: Main Building Alternative Wall Type', + 'Survey: Main Building Alternative Wall Insulation', + 'Survey: Main Building Alternative Wall Dry-lining', + 'Survey: Main Building Alternative Wall Thickness', + 'Survey: Main Fuel', + 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' + ] + ].rename( + columns={ + 'Walls': "Parity - Walls", + 'Roofs': "Parity - Roof", + 'Heating': "Parity - Heating", + 'Main Fuel': "Parity - Fuel", + 'Age': "Parity - Age Band", + 'Property Type': "Parity - Property Type" + } + ) + + # Sheet 2 is the lookup table which maps the properties to their closest match + # We need to bring in the parity attributes between the mapped properties so we can see side-by-side + mapped_lookup = matches_df[ + [ + 'Organisation Reference', + 'Best Match Organisation Reference', + 'Survey: Current EPC Band', + 'Survey: Current SAP Rating' + ] + ].rename( + columns={ + 'Best Match Organisation Reference': "Best Match - Organisation Reference", + "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band", + 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" + } + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + how="left", + on="Organisation Reference" + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + "Walls": "Best Match - Walls", + "Roofs": "Best Match - Roof", + "Heating": "Best Match - Heating", + "Main Fuel": "Best Match - Main Fuel", + "Age": "Best Match - Age", + "Property Type": "Best Match - Property Type" + } + ), + how="left", + on="Best Match - Organisation Reference" + ).merge( + coordinated_packages[ + [ + "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', + 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System', + ] + ].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type', + 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation', + 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type', + 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System', + } + ), + how="left", + on="Best Match - Organisation Reference" + ) + + # Finally, we have the property, against the mapped home with the estimate SAP scores and the EPC data + worksheet = mapped_priority_list[ + [ + 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', + 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', + 'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating' + ] + ].rename( + columns={ + "SAP": "Parity - SAP Rating", + "SAP Band": "Parity - EPC Rating", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Walls", + "Roofs": "Parity - Roofs", + 'Glazing': "Parity - Glazing", + 'Heating': 'Parity - Heating', + 'Main Fuel': 'Parity - Main Fuel', + 'Hot Water': 'Parity - Hot Water', + } + ).merge( + epc_df[ + [ + "Organisation Reference", + "uprn", + "current-energy-efficiency", + "current-energy-rating", + "lodgement-date", + "construction-age-band", + "walls-description", + "roof-description", + "mainheat-description", + "windows-description", + "hotwater-description", + "main-fuel", + "total-floor-area", + ] + ].rename( + columns={ + "uprn": "Last EPC - uprn", + "current-energy-efficiency": "Last EPC - SAP Score", + "current-energy-rating": "Last EPC - EPC Rating", + "lodgement-date": "Last EPC - Date Lodged", + "construction-age-band": "Last EPC - Age Band", + "walls-description": "Last EPC - Walls", + "roof-description": "Last EPC - Roof", + "mainheat-description": "Last EPC - Heating", + "windows-description": "Last EPC - Windows", + "hotwater-description": "Last EPC - Hot Water", + "main-fuel": "Last EPC - Main Fuel", + "total-floor-area": "Last EPC - Total Floor Area" + } + ), + how="left", + on='Organisation Reference' + ) + + worksheet["Years Since Last EPC"] # if __name__ == "__main__": # main() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 247ce98c..3432b744 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -20,7 +20,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True): epc_data = [] errors = [] no_epc = [] @@ -33,6 +33,11 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m if house_no is None: house_no = house_number uprn = manual_uprn_map.get(full_address, None) + if uprn is None and home.get("uprn"): + uprn = home["uprn"] + + if pd.isnull(uprn): + uprn = None searcher = SearchEpc( address1=str(house_no), @@ -88,6 +93,15 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m no_epc.append(home["row_id"]) continue + if epc_api_only: + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + continue + # Look for EPC recommendatons try: property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) diff --git a/survey_report/app.py b/survey_report/app.py index 87ce7864..be31bd52 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,6 +1,9 @@ import os import PyPDF2 from string import Template + +import pandas as pd + from survey_report.extraction.detect_report_type import detect_report_type from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor @@ -34,44 +37,54 @@ def handle(): :return: """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2" + folders = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5", + ] + data = [] + for data_folder in folders: - folder_contents = os.listdir(data_folder) - # We look for the following files: - # Site notes - file_mapping = {} - for file in folder_contents: - # Check if it's a pdf file - if not file.endswith(".pdf"): - continue - filepath = os.path.join(data_folder, file) - with (open(filepath, "rb") as f): - pdf = PyPDF2.PdfReader(f) - first_page = pdf.pages[0].extract_text() - text = "" - for page in pdf.pages: - text += page.extract_text() + folder_contents = os.listdir(data_folder) + # We look for the following files: + # Site notes + file_mapping = {} + for file in folder_contents: + # Check if it's a pdf file + if not file.endswith(".pdf"): + continue + filepath = os.path.join(data_folder, file) + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() - # Check the report type - report_type = detect_report_type(first_page) - if report_type is not None: - file_mapping[report_type] = text + # Check the report type + report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[report_type] = text - # This is only set up to work with quido site notes so we must have it - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) - site_notes = site_notes_extractor.extract_all() + # This is only set up to work with quido site notes so we must have it + site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes = site_notes_extractor.extract_all() - # We also must have an EPR - epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) - epr = epr_extractor.extract_all() + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr = epr_extractor.extract_all() - # We now produce the combined data sheet which is the starting figure: - data_sheet = {**epr, **site_notes} - del data_sheet['Building Dimensions'] - # We unnest the Total Building Dimensions - data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] - data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] - del data_sheet["Total Building Dimensions"] + # We now produce the combined data sheet which is the starting figure: + data_sheet = {**epr, **site_notes} + del data_sheet['Building Dimensions'] + # We unnest the Total Building Dimensions + data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + del data_sheet["Total Building Dimensions"] + data.append(data_sheet) + data = pd.DataFrame(data) # Generate the HTML report # Placeholder locations From 7885467fa40240a2a2632b4b6e120cce5a047c61 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Feb 2025 14:35:24 +0000 Subject: [PATCH 20/72] formatting output --- .../stonewater/Wave 3 Preparation.py | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1748f624..fcde164e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3777,6 +3777,21 @@ def revised_model(): no_match = [] matches = [] for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): + + # We check if the property was surveyed + survey_result = coordinated_packages[ + coordinated_packages["Organisation Reference"] == home["Organisation Reference"] + ] + if not survey_result.empty: + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m, + "Was Surveyed": True + } for m in survey_result["Organisation Reference"].values + ] + matches.extend(to_extend) + closest_match = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) @@ -3785,7 +3800,8 @@ def revised_model(): to_extend = [ { "Organisation Reference": home["Organisation Reference"], - "Best Match Organisation Reference": m + "Best Match Organisation Reference": m, + "Was Surveyed": False } for m in closest_match["Organisation Reference"].values ] matches.extend(to_extend) @@ -4010,7 +4026,8 @@ def revised_model(): [ 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', - 'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating' + 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', + 'Estimated SAP Rating', 'Estimated EPC Rating' ] ].rename( columns={ @@ -4023,6 +4040,7 @@ def revised_model(): 'Heating': 'Parity - Heating', 'Main Fuel': 'Parity - Main Fuel', 'Hot Water': 'Parity - Hot Water', + 'Proportion': 'Proportion of matched properties with same EPC rating', } ).merge( epc_df[ @@ -4061,7 +4079,25 @@ def revised_model(): on='Organisation Reference' ) - worksheet["Years Since Last EPC"] + worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime( + worksheet["Last EPC - Date Lodged"]).dt.year + + worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str) + + worksheet["uprn"] = np.where( + pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]), + worksheet["Last EPC - uprn"], + worksheet["uprn"] + ) + + worksheet["uprn"] = worksheet["uprn"].replace("", "") + + # Save to Excel with multiple sheets + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") + with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: + worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) + mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) + output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True) # if __name__ == "__main__": # main() From 77844c625eb1b00f140c1f64224b4101a51e1ca5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 10 Feb 2025 15:41:33 +0000 Subject: [PATCH 21/72] minor --- etl/customers/panacap/assets.py | 61 +++++ etl/customers/remote_assessments/app.py | 34 +-- .../stonewater/Wave 3 Preparation.py | 16 +- .../stonewater/potential_eco_properties.py | 250 ++++++++---------- etl/find_my_epc/RetrieveFindMyEpc.py | 19 +- etl/route_march_data_pull/app.py | 149 ++++++++--- recommendations/Recommendations.py | 2 + 7 files changed, 324 insertions(+), 207 deletions(-) create mode 100644 etl/customers/panacap/assets.py diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py new file mode 100644 index 00000000..ec57d9a4 --- /dev/null +++ b/etl/customers/panacap/assets.py @@ -0,0 +1,61 @@ +import os + +import pandas as pd +from dotenv import load_dotenv + +from etl.spatial.OpenUprnClient import OpenUprnClient +from etl.route_march_data_pull.app import get_data + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +addresses = [ + {"address": "3 Willis Road", "postcode": "CB1 2AQ"}, + {"address": "22 Catharine Street", "postcode": "CB1 3AW"}, + {"address": "332 Mill Road", "postcode": "CB1 3NN"}, + {"address": "330 Mill Road", "postcode": "CB1 3NN"}, + {"address": "328 Mill Road", "postcode": "CB1 3NN"}, + {"address": "71 Mill Road", "postcode": "CB1 2AS"}, + {"address": "78 Argyle Street", "postcode": "CB1 3LZ"}, + {"address": "9 Graham Road", "postcode": "CB4 2ZE"}, + {"address": "217 Mill Road", "postcode": "CB1 3BE"}, + {"address": "374 Mill Road", "postcode": "CB1 3NN"}, + {"address": "174 Thoday Street", "postcode": "CB1 3AX"}, + {"address": "37 Abbey Road", "postcode": "CB5 8HH"}, + {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"}, + {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"}, + {"address": "108 Argyle Street", "postcode": "CB1 3LS"}, + {"address": "115 Victoria Road", "postcode": "CB4 3BS"}, + {"address": "55 Ross Street", "postcode": "CB1 3BP"}, + {"address": "16 Kingston Street", "postcode": "CB1 2NU"}, + {"address": "13 Thoday Street", "postcode": "CB1 3AS"}, + {"address": "103 York Street", "postcode": "CB1 2PZ"}, +] + +asset_list = pd.DataFrame(addresses) +asset_list["row_id"] = asset_list.index + +epc_data, _, _ = get_data( + asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address", + manual_uprn_map={}, epc_api_only=True +) + +epc_df = pd.DataFrame(epc_data) +epc_df.shape + +asset_list = asset_list.merge( + epc_df, how="left", on="row_id" +) + +asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"}) +asset_list["uprn"] = asset_list["uprn"].astype(str) + +spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], bucket_name="retrofit-data-dev") +spatial_data["UPRN"] = spatial_data["UPRN"].astype(str) + +asset_list = asset_list.merge( + spatial_data, how="left", left_on="uprn", right_on="UPRN" +) + +asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv", + index=False) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 13cdc41b..e1298565 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 126 +PORTFOLIO_ID = 127 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,22 +19,9 @@ def app(): asset_list = [ { - "address": "Garden Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308249, - }, - { - "address": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308251 - }, - { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308250, + "address": "49 Brailsford Road", + "postcode": "M14 6PT", + "uprn": 77145666, } ] asset_list = pd.DataFrame(asset_list) @@ -65,18 +52,7 @@ def app(): valuation_data = [ { - "address": "Garden Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 - }, - { - "addresss": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 - }, - { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", + "uprn": 77145666, "valuation": 337_000 } ] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fcde164e..b2a92e4c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3777,7 +3777,6 @@ def revised_model(): no_match = [] matches = [] for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): - # We check if the property was surveyed survey_result = coordinated_packages[ coordinated_packages["Organisation Reference"] == home["Organisation Reference"] @@ -3791,6 +3790,7 @@ def revised_model(): } for m in survey_result["Organisation Reference"].values ] matches.extend(to_extend) + continue closest_match = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: @@ -3821,6 +3821,7 @@ def revised_model(): # (3953, 6), (2948, 6), (2969, 7), (2575, 7) matches_df = pd.DataFrame(matches) + matches_df = matches_df.merge( coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], left_on="Best Match Organisation Reference", right_on="Organisation Reference", @@ -3837,7 +3838,8 @@ def revised_model(): "Number of matches": 1, "Proportion": 100, "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], - "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0] + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], + "Was Surveyed": mapped_matches["Was Surveyed"].values[0], } ) continue @@ -3857,7 +3859,8 @@ def revised_model(): "Number of matches": number_of_matches, "Proportion": proportion_with_this_epc, "Estimated SAP Rating": average_rating, - "Estimated EPC Rating": average_epc_rating + "Estimated EPC Rating": average_epc_rating, + "Was Surveyed": False } ) @@ -3973,7 +3976,8 @@ def revised_model(): 'Organisation Reference', 'Best Match Organisation Reference', 'Survey: Current EPC Band', - 'Survey: Current SAP Rating' + 'Survey: Current SAP Rating', + "Was Surveyed" ] ].rename( columns={ @@ -4027,7 +4031,7 @@ def revised_model(): 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', - 'Estimated SAP Rating', 'Estimated EPC Rating' + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed" ] ].rename( columns={ @@ -4092,6 +4096,8 @@ def revised_model(): worksheet["uprn"] = worksheet["uprn"].replace("", "") + worksheet = worksheet.drop(columns=["Last EPC - uprn"]) + # Save to Excel with multiple sheets excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index bda9c30c..eef82eae 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -217,78 +217,7 @@ def app(): ) ) - # We get the EPC data - # epc_data = json.loads( - # read_from_s3( - # bucket_name="retrofit-data-dev", - # s3_file_name="customers/Stonewater/clustering/epc_data.json" - # ) - # ) - # epc_data = pd.DataFrame(epc_data) - # - # epc_data["uprn"] = np.where( - # epc_data["internal_id"] == 1091, - # 83143766, - # epc_data["uprn"] - # ) - # - # epc_data_batch_2 = read_pickle_from_s3( - # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - # bucket_name="retrofit-data-dev" - # ) - # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) - # - # complete_epcs = pd.concat([epc_data, epc_data_batch_2]) - # - # epcs_to_merge = complete_epcs[ - # [ - # "uprn", - # "address", - # "postcode", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "address": "Address", - # "postcode": "Postcode", - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ) - # # We de-dupe, taking the newest on the date the EPC was lod - # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) - # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) - # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") - stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str) - stonewater_cavity_properties["Reason Included"].value_counts() # Find the postcodes where an Osmosis survey revealed a need for CWI postcodes_found_needing_cwi = stonewater_cavity_properties[ stonewater_cavity_properties["Reason Included"].isin( @@ -339,12 +268,7 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # epcs_to_merge, - # how="left", - # left_on="UPRN", - # right_on="uprn" - # ) + ) # We now flag the additional properties in the as built list @@ -434,12 +358,11 @@ def app(): additional_properties["Suspected Needs CWI - not surveyed"] = ( ( - additional_properties["Postcode"].isin(postcodes_found_needing_cwi) + additional_properties["Postcode"].isin(postcodes_found_needing_cwi) & + ~additional_properties["Installed under ECO3"] ) ) - additional_properties["Same Postcode as Installed under ECO3"].value_counts() - # We drop Full Address additional_properties = additional_properties.drop(columns=["Full Address"]) additional_properties2 = additional_properties[[ @@ -461,65 +384,57 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # pd.DataFrame(additional_properties_epcs)[ - # [ - # "row_id", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ), - # how="left", - # on="row_id" - # ) + ) + + # Combine the data: + full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2]) + + # We not define the priority list for non-intrusives + full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2] + full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0] + + # Strip out anything we definitely don't want + full_dataset = full_dataset[~full_dataset["Installed under ECO3"]] + + areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique() + + priorities = full_dataset[ + full_dataset["Postal Region 2"].isin(areas) + ] + + region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index() + region_prevalance = region_prevalance[region_prevalance["count"] > 100] + df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)] + + df["Postal Region"].value_counts() + df["Postal Region 2"].value_counts() + + if df["Installed under ECO3"].sum(): + raise ValueError("There are properties in the priority list that were installed under ECO3") + + df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - " + "revised list.xlsx", + index=False + ) # We save the data locally - stonewater_cavity_properties.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " - "postcodes.csv", - index=False - ) - additional_properties2.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " - "non-priority postcodes.csv", - index=False - ) - # Save the survey findings - needs_cwi.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", - index=False - ) + # stonewater_cavity_properties.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " + # "postcodes.csv", + # index=False + # ) + # additional_properties2.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " + # "non-priority postcodes.csv", + # index=False + # ) + # # Save the survey findings + # needs_cwi.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - + # WIP.csv", + # index=False + # ) def cross_reference_epc_programme(): @@ -528,6 +443,12 @@ def cross_reference_epc_programme(): "SURVEYED - ECO3 NOT COMPLETED.xlsx" ) + for _, x in eco3_fallout.iterrows(): + house_no = SearchEpc.get_house_number(x["ADDRESS"], "") + if house_no is None: + house_no = x["ADDRESS"].split(",")[0] + x["house_number"] = house_no + eco3_fallout["house_number"] = eco3_fallout.apply( lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 ) @@ -558,3 +479,58 @@ def cross_reference_epc_programme(): stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) ] match.head() + + +def finalise_list_for_non_intrusives(): + non_intrusives_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater " + "Non-Intrusives.xlsx" + ) + + # Remove anything installed under ECO3 + non_intrusives_list = non_intrusives_list[~non_intrusives_list["Installed under ECO3"]] + + # We make any properties that were surveyed by Osmosis + packages = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 " + "(1).xlsx", + header=13, + sheet_name="Modelled Packages" + ) + + non_intrusives_list["Surveyed by Osmosis"] = non_intrusives_list["Address ID"].isin( + packages["Address ID"].values + ) + # Removed 54 addresses + final_non_intrusives = non_intrusives_list[ + ~non_intrusives_list["Surveyed by Osmosis"] + ] + + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + + # Add on the orgnisaion reference + final_non_intrusives = final_non_intrusives.merge( + features[["Organisation Reference", "Address ID"]], + how="left", + on="Address ID" + ) + + final_non_intrusives["Postal Region"] = final_non_intrusives["Postcode"].str.split(" ").str[0].str[0:2] + selected_regions = final_non_intrusives[ + final_non_intrusives["Include in non-intrusives"] + ]["Postcode"].unique() + + final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions) + + # Filter down: + final_non_intrusives = final_non_intrusives[ + final_non_intrusives["Is in region"] + ] + + final_non_intrusives.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives " + "List - final.xlsx") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index f93a5a73..eaba1058 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -25,6 +25,7 @@ class RetrieveFindMyEpc: self.postcode = postcode self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + self.walls = [] @staticmethod def extract_low_carbon_sources(soup): @@ -102,6 +103,8 @@ class RetrieveFindMyEpc: # 2) Bills estimates # 3) Recommendations and SAP points # 4) Low and zero carbon energy sources + # 5) The wall types of the property - used for determining if we have an extension wall insulation# + # recommendation ratings = address_res.find('desc', {'id': 'svg-desc'}).text current_rating = ratings.split(".")[0] @@ -208,6 +211,17 @@ class RetrieveFindMyEpc: if key not in assessment_data: raise ValueError(f"Missing key: {key}") + # The wall types of the property + property_features_table = address_res.find("tbody", class_="govuk-table__body") + property_features_table = property_features_table.find_all("tr") + + # Extract wall types + self.walls = [] + for row in property_features_table: + cells = row.find_all("td") + if row.find("th").text.strip() == "Wall": + self.walls.append(cells[0].text.strip()) + # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) @@ -229,8 +243,7 @@ class RetrieveFindMyEpc: return resulting_data - @staticmethod - def format_recommendations(recommendations, assessment_data, sap_2012_date=None): + def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None): """ This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey :param recommendations: The recommendations from the EPC @@ -330,6 +343,8 @@ class RetrieveFindMyEpc: for rec in recommendations: mapped = measure_map[rec["measure"]] for measure in mapped: + if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower(): + measure = "extension_cavity_wall_insulation" to_append = { "type": measure, "sap_points": rec["sap_points"], diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 3432b744..cc50caae 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,5 +1,6 @@ import os import time +import pickle import pandas as pd import numpy as np @@ -20,7 +21,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False): epc_data = [] errors = [] no_epc = [] @@ -116,10 +117,14 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() except ValueError as e: if "No EPC found" in str(e) and "address1" in searcher.newest_epc: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_data = {} else: find_epc_data = {} except Exception as e: @@ -176,19 +181,33 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing" - DATA_FILENAME = "For Housing Data pull.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "NO." + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People" + DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx" + SHEET_NAME = "Assets 1" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = "AddressLine1" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"] + ADDRESS_COLS_TO_CONCAT = [] + MISSING_POSTCODES_METHOD = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) + + if MISSING_POSTCODES_METHOD is not None: + if MISSING_POSTCODES_METHOD == "last_two_words": + # Replace any double spaces + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) + asset_list["Postcode"] = np.where( + pd.isnull(asset_list["Postcode"]), + asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "), + asset_list["Postcode"] + ) + else: + raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized") + asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index @@ -217,29 +236,46 @@ def app(): asset_list = asset_list[~asset_list["deduper"].duplicated()] asset_list = asset_list.drop(columns=["deduper"]) - epc_data, errors, no_epc = get_data( - asset_list=asset_list, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) + # We chunk up this data into 5000 rows at a time + chunk_size = 5000 + epc_data = [] + errors = [] + no_epc = [] + skip = None # Used to skip already completed chunks + for i in range(0, len(asset_list), chunk_size): + print(f"Processing chunk {i} to {i + chunk_size}") + if skip is not None: + if i <= skip: + continue + chunk = asset_list[i:i + chunk_size] + epc_data_chunk, errors_chunk, no_epc_chunk = get_data( + asset_list=chunk, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP + ) - # We now retrieve any failed properties - asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _, _ = get_data( - asset_list=asset_list_failed, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) + # We now retrieve any failed properties + chunk_failed = chunk[chunk["row_id"].isin(errors)] + epc_data_failed, _, _ = get_data( + asset_list=chunk_failed, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP, + epc_api_only=False + ) - no_data = asset_list[asset_list["row_id"].isin(no_epc)] - print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]]) + epc_data_chunk.extend(epc_data_failed) + errors.extend(errors_chunk) + no_epc.extend(no_epc_chunk) - # Append the failed data to the main data - epc_data.extend(epc_data_failed) + # Append the failed data to the main data + # Store the chunk locally as a csv + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + + epc_data.extend(epc_data_chunk) epc_df = pd.DataFrame(epc_data) @@ -339,7 +375,7 @@ def app(): "current-energy-efficiency": "SAP score on register", "current-energy-rating": "EPC rating on register", "property-type": "Property Type", - "built-form": "Archetype", + "built-form": "Archetype - EPC", "total-floor-area": "Property Floor Area", "construction-age-band": "Property Age Band", "floor-height": "Property Floor Height", @@ -375,7 +411,7 @@ def app(): num_floors=x["Estimated Number of Floors"], floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, perimeter=x["Estimated Perimeter (m)"], - built_form=x["Archetype"] + built_form=x["Archetype - EPC"] ), axis=1 ) @@ -406,3 +442,48 @@ def app(): matches_review = asset_list[ [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] + + +import requests +import base64 + +API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e" +URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20" +email = "itskruel@gmail.com" + +AUTH_TOKEN = base64.b64encode( + ":".join([email, API_KEY]).encode("utf-8") +) + +AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU=" + +headers = { + "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN), + "Accept": "application/json", +} + +params = { + "UPRN": "766024370" +} + +response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370", + headers=headers) +response.json() + +data = response.json() + +from operator import itemgetter + +newest = sorted(data["rows"], key=itemgetter('lodgement-date')) +data["rows"][0]["lodgement-date"] +data["rows"][1]["lodgement-date"] + +import pandas as pd + +df = pd.DataFrame(data["rows"]) + +df["uprn"].values[2] + +df[df["uprn"] == "3455035000"]["property-type"] + +from backend.apis.GoogleSolarApi import GoogleSolarApi diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 15614a0b..03e651e8 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -503,7 +503,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], "sap": sap + rec["sap_points"], "carbon": carbon - rec["co2_equivalent_savings"], "heat_demand": heat_demand - rec["heat_demand"], From 61544d01db865af74608e8d2e9d1ea3e9d727dde Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 10:14:14 +0000 Subject: [PATCH 22/72] updating data pull code --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 10 +- .../stonewater/potential_eco_properties.py | 12 +- etl/route_march_data_pull/app.py | 322 ++++++++++++++---- 5 files changed, 274 insertions(+), 74 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index e1298565..f32dcea6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "49 Brailsford Road", - "postcode": "M14 6PT", - "uprn": 77145666, + "address": "19 Hillcrest Court", + "postcode": "IP21 4YJ", + "uprn": 2630134524, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 77145666, - "valuation": 337_000 + "uprn": 2630134524, + "valuation": 96_000 } ] # Store valuation data to s3 diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index eef82eae..6666ce15 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -368,9 +368,10 @@ def app(): additional_properties2 = additional_properties[[ "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3', - 'Same Postcode as Installed under ECO3' + 'Same Postcode as Installed under ECO3', "Organisation Reference", ]].rename( columns={ + "Organisation Reference": "Org. ref.", "SAP": "Parity - Predicted SAP", "SAP Band": "Parity - Predicted SAP Band", "Age": "Parity - Build Age", @@ -387,7 +388,12 @@ def app(): ) # Combine the data: - full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2]) + + stonewater_cavity_properties2 = stonewater_cavity_properties.merge( + features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference" + ) + full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2]) + full_dataset = full_dataset.drop(columns=['Osm. ID']) # We not define the priority list for non-intrusives full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2] @@ -414,7 +420,7 @@ def app(): df.to_csv( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - " - "revised list.xlsx", + "revised list.csv", index=False ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index cc50caae..dba85b3f 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,7 +1,6 @@ import os import time -import pickle - +from BaseUtility import Definitions import pandas as pd import numpy as np from tqdm import tqdm @@ -17,6 +16,10 @@ from recommendations.recommendation_utils import ( estimate_number_of_floors ) +from etl.epc_clean.epc_attributes.attribute_utils import ( + extract_thermal_transmittance +) + load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -158,6 +161,53 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"): raise ValueError(f"Method {method} not recognized") +def process_age_band(x, year_built_column): + year_built = float(x[year_built_column]) + + if pd.isnull(x["Property Age Band"]) or ( + x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES + ) or pd.isnull(year_built): + return "No EPC Age Band" + + # We check if we have a numeric data + if x["Property Age Band"].isdigit(): + if year_built == float(x["Property Age Band"]): + return "EPC Age Band Matches Year Built" + if year_built > float(x["Property Age Band"]): + return "EPC Age Band is older than Year Built" + if year_built < float(x["Property Age Band"]): + return "EPC Age Band is newer than Year Built" + + # Handle specific case + if x["Property Age Band"] == "England and Wales: 2007 onwards": + if year_built >= 2007: + return "EPC Age Band Matches Year Built" + if year_built < 2007: + return "EPC Age Band is older than Year Built" + + if x["Property Age Band"] == "England and Wales: before 1900": + if year_built < 1900: + return "EPC Age Band Matches Year Built" + if year_built >= 1900: + return "EPC Age Band is newer than Year Built" + + # Age band will be formatted as such: + # 'England and Wales: {upper date}-{lower date}' + # so we extract the lower and upper date + age_band = x["Property Age Band"].split(": ")[1] + lower_date, upper_date = age_band.split("-") + if year_built <= float(upper_date) and year_built <= float(upper_date): + return "EPC Age Band Matches Year Built" + + if year_built > float(upper_date): + return "EPC Age Band is older than Year Built" + + if year_built < float(upper_date): + return "EPC Age Band is newer than Year Built" + + raise Exception("Should not reach here") + + def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -179,17 +229,47 @@ def app(): Heat loss calculations EPC recommendations Property UPRN - """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People" - DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx" - SHEET_NAME = "Assets 1" + + # TODO: + # For cavity work: + # - Flag any entries that have a different wall type between non-intrusive data against EPC + # - Worth double checking entries that have a difference in wall construction + # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity + # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation + # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats + # are less than C75 + # - Flag anything pre SAP2012 + # - Flag anything over 5 years old + # - Look at year built vs age band + # + # For Solar: + # - Discount any that have solar PV - based on non-intrusives and from the inspections team + # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with + # electric room heaters but it might need to be an EPC E + # - Fabric - check the floor, wall and roof: + # - Filled or empty cavity is good + # - Insulated solid/timber/system built is good + # - SCIS/CEG needs solid floors + # - JJC don’t care + # - Anything with a loft 200 or below + # - Anything C75 and above won’t qualify + # - Insulated loft = 200mm + # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) + # - Or the insulation required is loft/cavity (floors should be solid) + + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight" + DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx" + SHEET_NAME = "Sheet1" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = "AddressLine1" + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "HouseName" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [] + ADDRESS_COLS_TO_CONCAT = [ + "HouseName", "Block", "Address1" + ] MISSING_POSTCODES_METHOD = None + PROPERTY_YEAR_BUILT = 'Built In Year' # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -216,6 +296,7 @@ def app(): asset_list[col] = asset_list[col].astype(str) asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) + asset_list[col] = asset_list[col].str.strip() if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" @@ -226,7 +307,15 @@ def app(): if FULLADDRESS_COLUMN is None: FULLADDRESS_COLUMN = "fulladdress_extracted" # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas - asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + # Sometimes, some of the columns are empty, so we need to remove them + asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply( + lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1 + ) + + # We clean up portential non-breaking spaces, and double spaces + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str) + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] @@ -237,8 +326,10 @@ def app(): asset_list = asset_list.drop(columns=["deduper"]) # We chunk up this data into 5000 rows at a time + # Create the chunks directory + if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")): + os.makedirs(os.path.join(DATA_FOLDER, "Chunks")) chunk_size = 5000 - epc_data = [] errors = [] no_epc = [] skip = None # Used to skip already completed chunks @@ -275,9 +366,19 @@ def app(): # Store the chunk locally as a csv pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) - epc_data.extend(epc_data_chunk) + # We read in and concatenate the created created chunks + chunks_folder = os.path.join(DATA_FOLDER, "Chunks") + # List the contents + chunk_files = os.listdir(chunks_folder) + epc_data = [] + for file in chunk_files: + csv_data = pd.read_csv(os.path.join(chunks_folder, file)) + # We need to convert the recommendations back to a list + csv_data["recommendations"] = csv_data["recommendations"].apply(eval) + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + epc_data.append(csv_data) - epc_df = pd.DataFrame(epc_data) + epc_df = pd.concat(epc_data) # We expand out the recommendations recommendations_df = epc_df[["row_id", "recommendations"]] @@ -302,9 +403,9 @@ def app(): transformed_data.append(row_data) transformed_df = pd.DataFrame(transformed_data) - # Drop the column that is "" - if "" in transformed_df.columns: - transformed_df = transformed_df.drop(columns=[""]) + # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation + # recommendations + transformed_df = transformed_df[["row_id", "Cavity wall insulation"]] # Get the find my epc data find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( @@ -342,7 +443,9 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}) + ].rename( + columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} + ) asset_list = asset_list.merge( epc_df, @@ -422,6 +525,138 @@ def app(): axis=1 ) + # We produce some additional fields + # 1) Is the SAP rating below C75 + asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75 + # 2) Flag anything where the EPC is older than 5 years + cutoff_year = pd.Timestamp.now().year - 5 + asset_list[f"EPC is pre {cutoff_year}"] = ( + pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year + ) + + # 3) If we have year in the asset list, we flag entries where the built year is different from the + # EPC Age band + if PROPERTY_YEAR_BUILT is not None: + asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( + lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 + ) + + # 4) Flag properties that look like they're good candidates for solar installs + # Firstly, flag if the fabric is completely done + + insulated_wall_substrings = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + insulated_roof_substrings = [ + "(another dwelling above)", "limited insulation", "(other premises above)", + ", no insulation", + ] + + def check_solar_insulation_conditions(x): + + if pd.isnull(x["Wall Construction"]): + return None + + if "average thermal transmittance" in x["Wall Construction"].lower(): + # We extract out the u-values + wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"] + roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"] + floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"] + + roof_uvalue = 0 if roof_uvalue is None else roof_uvalue + floor_uvalue = 0 if floor_uvalue is None else floor_uvalue + + # We apply some cutoffs + if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7: + return "Walls, Roof and Floor have U-values below 0.7" + + return "Confirm U-values" + + walls_insulated = any( + insulated_substring in x["Wall Construction"].lower() for insulated_substring in insulated_wall_substrings + ) + roof_is_numeric = False + if str(x["Roof Insulation Thickness"]).isdigit(): + roof_is_numeric = True + roof_insulated = int(x["Roof Insulation Thickness"]) >= 200 + else: + roof_insulated = any( + insulated_substring in x["Roof Construction"].lower() for insulated_substring in + insulated_roof_substrings + ) + + floor_is_solid = "solid" in x["Floor Construction"].lower() + + if walls_insulated and roof_insulated and floor_is_solid: + return "Walls Insulated, Roof Insulated, Floor Solid" + + if walls_insulated and floor_is_solid and roof_is_numeric: + return "Walls Insulated, Floor Solid, Loft need top-up" + + return "Not Fully Insulated or no data" + + asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1) + + asset_list["Good Solar Candidate"] = ( + asset_list["SAP Rating is 75 and below"] & + ~asset_list["Has Solar PV"] & + ( + asset_list["Heating Type"].isin( + [ + "Electric storage heaters", + "Room heaters, electric", + ] + ) | asset_list["Heating Type"].str.contains("heat pump", case=False) + ) & ( + asset_list["Solar Fabric Condition"].isin( + [ + "Walls Insulated, Roof Insulated, Floor Solid", + "Walls, Roof and Floor have U-values below 0.7", + "Walls Insulated, Floor Solid, Loft need top-up" + ] + ) + ) + ) + + def flat_analysis(asset_list): + + # We need to deduce the building name - we strip out the house number + def extract_building_name(x): + # TODO: This doesn't really work + if pd.isnull(x): + return None + house_no = SearchEpc.get_house_number(address=x, postcode=None) + if house_no: + return x.replace(house_no, "").strip() + return x.split(",")[0].strip() + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = asset_list.groupby(["Postcode", "Property Type"]) + + flat_data = [] + for _, group in grouped: + if "flat" in group["Property Type"].str.lower().values: + num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) + num_below_c75 = group["SAP score on register"].lt(75).sum() + + flat_data.append( + { + "Postcode": group["Postcode"].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + } + ) + + flat_data = pd.DataFrame(flat_data) + + return flat_data + + flat_data = flat_analysis(asset_list) + # For all of the columns in transformed_df, prefix with "Recommendation: " for col in transformed_df.columns: if col == "row_id": @@ -436,54 +671,13 @@ def app(): asset_list = asset_list.drop(columns=["row_id", "index"]) # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" - asset_list.to_excel(filename, index=False) + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data + + with pd.ExcelWriter(filename) as writer: + asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + flat_data.to_excel(writer, sheet_name="Flat Data", index=False) matches_review = asset_list[ [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] - - -import requests -import base64 - -API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e" -URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20" -email = "itskruel@gmail.com" - -AUTH_TOKEN = base64.b64encode( - ":".join([email, API_KEY]).encode("utf-8") -) - -AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU=" - -headers = { - "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN), - "Accept": "application/json", -} - -params = { - "UPRN": "766024370" -} - -response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370", - headers=headers) -response.json() - -data = response.json() - -from operator import itemgetter - -newest = sorted(data["rows"], key=itemgetter('lodgement-date')) -data["rows"][0]["lodgement-date"] -data["rows"][1]["lodgement-date"] - -import pandas as pd - -df = pd.DataFrame(data["rows"]) - -df["uprn"].values[2] - -df[df["uprn"] == "3455035000"]["property-type"] - -from backend.apis.GoogleSolarApi import GoogleSolarApi From 959d29b675a6b8e6c57074d5a9fe5a3973ed1d96 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 15:20:55 +0000 Subject: [PATCH 23/72] allowing optional ashp cop parameter --- backend/app/plan/router.py | 5 +++-- backend/app/plan/schemas.py | 2 ++ etl/customers/l_and_g/ic_slides.py | 5 ++++- recommendations/Recommendations.py | 25 +++++++++++++++++++------ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 04a2ef7f..f85ceacc 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -370,7 +370,7 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) # Check if the valuation data has uprn - valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True + valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False if valuation_has_uprn: valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None] @@ -692,7 +692,8 @@ async def trigger_plan(body: PlanTriggerRequest): Recommendations.calculate_recommendation_tenant_savings( property_instance=property_instance, kwh_simulation_predictions=kwh_simulation_predictions, - property_recommendations=property_recommendations + property_recommendations=property_recommendations, + ashp_cop=body.ashp_cop ) ) property_instance.current_energy_bill = property_current_energy_bill diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index f84912fe..618bec90 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -80,3 +80,5 @@ class PlanTriggerRequest(BaseModel): multi_plan: Optional[bool] = False optimise: Optional[bool] = True default_u_values: Optional[bool] = True + + ashp_cop: Optional[float] = 2.8 diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py index 72dfc2c0..a5cb3511 100644 --- a/etl/customers/l_and_g/ic_slides.py +++ b/etl/customers/l_and_g/ic_slides.py @@ -132,7 +132,7 @@ def get_data(portfolio_id, scenario_ids): return properties_data, plans_data, recommendations_data -properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199]) +properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205]) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) @@ -240,4 +240,7 @@ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) +df["Recommendation: Air Source Heat Pump"].sum() +df["Cost: Air Source Heat Pump"].sum() + df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False) diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 03e651e8..42f4e783 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -649,7 +649,9 @@ class Recommendations: return property_recommendations, impact_summary @staticmethod - def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description): + def map_descriptions_to_fuel( + heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types + ): # Handle the case of community schemes if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"): @@ -662,7 +664,7 @@ class Recommendations: } raise NotImplementedError("Handle this case") - mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description] + mapped = descriptions_to_fuel_types[heating_description] heating_fuel = mapped["fuel"] if hotwater_description in [ @@ -682,7 +684,7 @@ class Recommendations: "heating_cop": mapped["cop"], "hotwater_cop": 1 } - mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description] + mapped_hotwater = descriptions_to_fuel_types[hotwater_description] return { "heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"], @@ -691,7 +693,7 @@ class Recommendations: @classmethod def calculate_recommendation_tenant_savings( - cls, property_instance, kwh_simulation_predictions, property_recommendations + cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None ): """ This method inserts the kwh savings and the bill savings that the customer will make from the recommendations @@ -703,9 +705,12 @@ class Recommendations: :param property_instance: Instance of the Property class, for the home associated to property_id :param kwh_simulation_predictions: dictionary of predictions from the model apis :param property_recommendations: dictionary of recommendations for the property + :param ashp_cop: The coefficient of performance for the air source heat pump. :return: """ + ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY + kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][ kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id) ].merge( @@ -774,12 +779,19 @@ class Recommendations: if kwh_impact_table.loc[i, col] > previous_phase[col].max(): kwh_impact_table.loc[i, col] = previous_phase[col].max() + descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES + # We will the air source heat pump efficiencies + ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()] + for k in ashp_keys: + descriptions_to_fuel_types[k]["cop"] = ashp_cop + # For heating system recommendations, this could result in a fuel type change so we reflect that fuel_mapping = pd.DataFrame([ { "id": epc["id"], **cls.map_descriptions_to_fuel( - epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"] + epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"], + descriptions_to_fuel_types ) } for epc in property_instance.updated_simulation_epcs ]) @@ -793,7 +805,8 @@ class Recommendations: **cls.map_descriptions_to_fuel( property_instance.data["mainheat-description"], property_instance.data["hotwater-description"], - property_instance.data["main-fuel"] + property_instance.data["main-fuel"], + descriptions_to_fuel_types ) } ] From 6396f081c15a56dcb799db1edd64edbb89c56921 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 16:19:52 +0000 Subject: [PATCH 24/72] stonewater extracting age --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/stonewater/Wave 3 Preparation.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b2a92e4c..24a8e9bb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -125,6 +125,7 @@ def extract_summary_report(pdf_path): - Address """ + blah data = { "Address": None, "Postcode": None, @@ -701,6 +702,7 @@ def extract_epr(pdf_path): "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, + "Main Building Age Band": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -779,6 +781,10 @@ def extract_epr(pdf_path): floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) + # Extract age band + age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text) + data["Main Building Age Band"] = age_band_match.group(1) + # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) @@ -3022,7 +3028,6 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) # Check that the survey folder is actually a folder From 84d4070b490a04d0cf4fdefc20ab4aaaab1d7d05 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 17:10:21 +0000 Subject: [PATCH 25/72] extracting from ima --- .../stonewater/Wave 3 Preparation.py | 61 ++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 24a8e9bb..e471211c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -125,13 +125,13 @@ def extract_summary_report(pdf_path): - Address """ - blah data = { "Address": None, "Postcode": None, "Current SAP Rating": None, "Current EPC Band": None, "Fuel Bill": None, + "Main Building Age Band": None, "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -181,6 +181,10 @@ def extract_summary_report(pdf_path): sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + # Extract age + age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + data["Main Building Age Band"] = age_band_match.group(1) + # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) @@ -3027,6 +3031,7 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] + mtp_extracted_data = [] # Additional data to extract from the medium term plans for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -3048,6 +3053,58 @@ def revised_model(): None ) + mtp_folder = next( + (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), + None + ) + if mtp_folder: + # We have a mid term plan: + mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) + # Get the contents - files and not folder + mtp_contents = [ + os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) + if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) + ] + # We check the the IMA + for file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) + # We expect a pdf so try and parse it + try: + with open(filepath, "rb") as file: + reader = PyPDF2.PdfReader(file) + # Just the first page + text = reader.pages[0].extract_text() + + except Exception as e: + continue + + # We check if this is an IMA + ima_heading_search = re.search( + r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text + ) + + is_ima = bool(ima_heading_search) + if not is_ima: + continue + + # Otherwise, extract: RIR, PV + pv_search = re.search(r"PV \(\d+Kwp\)", text) + has_pv = bool(pv_search) + pv_system = pv_search.group(0) if has_pv else None + + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) + has_rir = bool(rir_search) + rir_spec = rir_search.group(0) if has_rir else None + + mtp_extracted_data.append({ + "survey_folder": survey_folder, + "has_pv": has_pv, + "PV System": pv_system, + "RIR Specification": rir_spec, + "has_rir": has_rir + }) + continue + # If retrofit assessment folder exists, check if it has content if retrofit_folder or ra_folder: if retrofit_folder: @@ -3094,7 +3151,7 @@ def revised_model(): retrofit_assessment_data = pd.DataFrame(extracted_data) # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), From 711db3f552e958128faeb49a22073e5461dbc4f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 07:59:12 +0000 Subject: [PATCH 26/72] adding v1 extraction to stonewater --- .../stonewater/Wave 3 Preparation.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index e471211c..12158671 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -182,7 +182,10 @@ def extract_summary_report(pdf_path): data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Extract age - age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + age_band_match = re.search( + r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) data["Main Building Age Band"] = age_band_match.group(1) # Number of storeys @@ -786,7 +789,11 @@ def extract_epr(pdf_path): data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract age band - age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text) + age_band_match = re.search( + r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) + data["Main Building Age Band"] = age_band_match.group(1) # Extract Number of Storeys @@ -3065,8 +3072,21 @@ def revised_model(): os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) ] + + has_v1 = [ + f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() + ] + + if has_v1: + # Then we go one level deeper + mtp_contents = [ + os.path.join(has_v1[0], f) for f in + os.listdir(os.path.join(survey_folder_path, has_v1[0])) + ] + # We check the the IMA for file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) # We expect a pdf so try and parse it try: @@ -3092,6 +3112,12 @@ def revised_model(): has_pv = bool(pv_search) pv_system = pv_search.group(0) if has_pv else None + # We perform a second search for PV: + if pv_search is None: + pv_search = re.search("solar pv", text.lower()) + has_pv = bool(pv_search) + pv_system = "Solar PV" if has_pv else None + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) has_rir = bool(rir_search) rir_spec = rir_search.group(0) if has_rir else None @@ -3149,12 +3175,20 @@ def revised_model(): extracted_data.append(summary_data) retrofit_assessment_data = pd.DataFrame(extracted_data) + mtp_df = pd.DataFrame(mtp_extracted_data) + # Save # retrofit_assessment_data.to_csv( # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) + # mtp_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + ) + mtp_df = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), ) # Remove some definite duplicates @@ -3164,6 +3198,9 @@ def revised_model(): # Get all of the folders that end with ROSS to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + retrofit_assessment_data = retrofit_assessment_data[ ~retrofit_assessment_data["survey_folder"].isin( [ @@ -3173,8 +3210,6 @@ def revised_model(): ] + to_drop ) ] - # Replace \n with "" - retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") retrofit_assessments_data_columns = [ 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', @@ -3685,9 +3720,17 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + # We merge the mpt data on to the wates coordination + wates_coordination = wates_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( ccs_matching_lookup, how="left", on="Name" ) + ccs_coordination = ccs_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) retrofit_packages_board = retrofit_packages_board.merge( matching_lookup, how="left", on="Name" From b8a094106c7a8ff7260648ba18d8d48b8f8715e1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 17:28:47 +0000 Subject: [PATCH 27/72] updating stonewater --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 12 ++-- .../stonewater/Wave 3 Preparation.py | 72 ++++++++++++------- etl/customers/stonewater/data_cleaning.py | 59 ++++++++------- 5 files changed, 89 insertions(+), 58 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index f32dcea6..70ceb76d 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 127 +PORTFOLIO_ID = 128 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "19 Hillcrest Court", - "postcode": "IP21 4YJ", - "uprn": 2630134524, + "address": "46", + "postcode": "BS6 7BD", + "uprn": 61091, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 2630134524, - "valuation": 96_000 + "uprn": 61091, + "valuation": 897_000 } ] # Store valuation data to s3 diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 12158671..94904aae 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3028,11 +3028,12 @@ def revised_model(): "10. Little Island", "11. CCS Dorset" ] + wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list @@ -3179,18 +3180,32 @@ def revised_model(): # Save # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False # ) # mtp_df.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), ) mtp_df = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), ) + # There are a few duplicates we just manually drop + mtp_df = mtp_df.drop_duplicates() + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" + ) & (~mtp_df["has_pv"])) + ] + + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5" + ) & (~mtp_df["has_pv"])) + ] + # Remove some definite duplicates dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] @@ -3487,7 +3502,7 @@ def revised_model(): ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] ccs_manual_filters = { - "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" + "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" } ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): @@ -3583,13 +3598,13 @@ def revised_model(): ] wates_manual_filters = { - "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View", - "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft", - "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31 Rabley Wood View", - 'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13', - "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4", - '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1', - '2 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 2', + "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', + '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2', } wates_matching_lookup = [] # Examples to skip when we cannot get the data @@ -3720,6 +3735,9 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + if wates_coordination["Asset ID_x"].duplicated().sum(): + raise Exception("Duplicated IDs in wates") + # We merge the mpt data on to the wates coordination wates_coordination = wates_coordination.merge( mtp_df, how="left", on="survey_folder" @@ -3839,29 +3857,31 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), ] - for i, filters in enumerate(filter_levels): + max_confidence = max([confidence for (_, confidence) in filter_levels]) + + for i, (filters, match_confidence) in enumerate(filter_levels): match = coordinated_packages.copy() for col in filters: match = match[match[col] == home[col]] if not match.empty: - return match + return match, match_confidence # Finally, we search for a property in the same Archetype match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] if not match.empty: - return match + return match, max_confidence + 1 - return None # No match found + return None, None # No match found coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() @@ -3896,8 +3916,8 @@ def revised_model(): ] matches.extend(to_extend) continue - - closest_match = find_nearest_matching_property(coordinated_packages, home) + blah + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) continue diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 010902ce..a5da0c79 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -86,8 +86,14 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) + folders_to_keep = [ + "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth", + "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire", + "9. Guildford", "10. Little Island", "11. CCS Dorset", + ] + folders_to_pull = [ - folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"] + folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: # Get the contents @@ -109,35 +115,40 @@ def download_data_from_sharepoint(): ) if not property_folder_contents.get("value"): continue - # We look for the retrofit assessment folder: + # We look for the retrofit assessment folder or mtp folders: property_sub_folders = [ - f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() + f for f in property_folder_contents["value"] if + "ra coordinator info" in f["name"].lower() or + "retrofit assessment" in f["name"].lower() or + "ra info" in f["name"].lower() or + "mtp" in f["name"].lower() or + "mid-term" in f["name"].lower() ] if not property_sub_folders: continue - # if we have this, we download the folder and store it on my laptop! - property_sub_folder = property_sub_folders[0] + for property_sub_folder in property_sub_folders: + # if we have this, we download the folder and store it on my laptop! - property_folder_path = os.path.join( - "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + property_folder_path = os.path.join( + "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - download_dir = os.path.join( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + download_dir = os.path.join( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - # We download the folder - sharepoint_client.download_sharepoint_folder( - drive_id=sharepoint_client.document_drive["id"], - folder_path=property_folder_path, - download_dir=download_dir, - excluded_file_types=["MOV", "jpg"] - ) + # We download the folder + sharepoint_client.download_sharepoint_folder( + drive_id=sharepoint_client.document_drive["id"], + folder_path=property_folder_path, + download_dir=download_dir, + excluded_file_types=["MOV", "jpg"] + ) From bd131a2f663056fb46a906d8f148b2bcc06cd871 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 22:32:31 +0000 Subject: [PATCH 28/72] preparing outputs for stonewater --- .../stonewater/Wave 3 Preparation.py | 77 +++++++++++++++---- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 94904aae..50dadcaf 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2984,6 +2984,8 @@ def revised_model(): original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) + wave_21_folder_name = "Wave 2.1 Surveys - 2" + # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) @@ -3028,7 +3030,6 @@ def revised_model(): "10. Little Island", "11. CCS Dorset" ] - wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) @@ -3252,7 +3253,9 @@ def revised_model(): 'Main Wall Thickness', 'Main Building Alternative Wall Type', 'Main Building Alternative Wall Insulation', 'Main Building Alternative Wall Dry-lining', - 'Main Building Alternative Wall Thickness', 'Main Fuel' + 'Main Building Alternative Wall Thickness', + 'Main Fuel', + 'Main Building Age Band', ] # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] @@ -3795,7 +3798,8 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", + 'Ventilation', 'Heating', 'Other Measures', 'PV System', + "Asset ID.1_y", ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3811,6 +3815,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID.1_y': 'Organisation Reference', + "PV System": "Solar PV", } ), wates_coordination[ @@ -3818,8 +3823,7 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' - + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3835,6 +3839,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID_x': 'Organisation Reference', + "PV System": "Solar PV", } ) ] @@ -3857,12 +3862,12 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), - (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), - (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), - (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), - (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), - (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7), ] max_confidence = max([confidence for (_, confidence) in filter_levels]) @@ -3911,12 +3916,13 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": 1, "Was Surveyed": True } for m in survey_result["Organisation Reference"].values ] matches.extend(to_extend) continue - blah + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) @@ -3926,6 +3932,7 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": match_confidence, "Was Surveyed": False } for m in closest_match["Organisation Reference"].values ] @@ -3953,10 +3960,29 @@ def revised_model(): suffixes=("", " - Closest Match") ) + measures_columns = [ + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures' + ] + # We want to aggregate the matches, when we have multiple aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + + measures = coordinated_packages[ + ( + coordinated_packages["Organisation Reference"].isin( + mapped_matches['Best Match Organisation Reference'].values + ) + ) + ][measures_columns] + if mapped_matches.shape[0] == 1: + # Get the measures for this property + measures = measures.squeeze() + aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3965,6 +3991,7 @@ def revised_model(): "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], "Was Surveyed": mapped_matches["Was Surveyed"].values[0], + **measures } ) continue @@ -3978,6 +4005,17 @@ def revised_model(): mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ 0] / number_of_matches * 100 ) + + measures_aggregated = {} + for m in measures_columns: + if any(~pd.isnull(measures[m])): + # Check if we have 2 unique values + vals = measures[~pd.isnull(measures[m])][m].unique() + if len(vals) > 1: + measures_aggregated[m] = ", ".join(vals) + else: + measures_aggregated[m] = vals[0] + aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3985,7 +4023,8 @@ def revised_model(): "Proportion": proportion_with_this_epc, "Estimated SAP Rating": average_rating, "Estimated EPC Rating": average_epc_rating, - "Was Surveyed": False + "Was Surveyed": False, + **measures_aggregated } ) @@ -4002,7 +4041,6 @@ def revised_model(): def remove_leading_zero(address): return re.sub(r"^0([1-9]) ", r"\1 ", address) - # Example usage mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) mapped_priority_list["address1"] = np.where( mapped_priority_list["Organisation Reference"] == 37004, @@ -4020,6 +4058,13 @@ def revised_model(): ) mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + # Flag where 2 out of the three columns have consensus + mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( + (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | + (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | + (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) + ) + # Let's get the newest EPC data for these properties # We merge on UPRN, when we have it # from etl.route_march_data_pull.app import get_data @@ -4081,6 +4126,7 @@ def revised_model(): 'Survey: Main Building Alternative Wall Dry-lining', 'Survey: Main Building Alternative Wall Thickness', 'Survey: Main Fuel', + 'Survey: Main Building Age Band', 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' ] ].rename( @@ -4133,7 +4179,8 @@ def revised_model(): [ "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', - 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', + 'Survey: Main Building Wall Area (m2)', ] ].rename( columns={ From 846cd99631923224d4ba8d776bdeaed35b08884a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Feb 2025 16:05:57 +0000 Subject: [PATCH 29/72] switch off solar PV if property is listed/heritage or in a conservation area --- backend/Property.py | 5 ++++ backend/app/plan/router.py | 6 ++-- etl/customers/lambeth/re-knocks.py | 23 +++++++++++++++ .../stonewater/Wave 3 Preparation.py | 28 +++++++++++++------ etl/route_march_data_pull/app.py | 22 +++++++-------- 5 files changed, 62 insertions(+), 22 deletions(-) create mode 100644 etl/customers/lambeth/re-knocks.py diff --git a/backend/Property.py b/backend/Property.py index a495431f..e19970eb 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -395,6 +395,7 @@ class Property: primary_recommendation_id=rec["recommendation_id"], non_invasive_recommendations=self.non_invasive_recommendations, ) + self.recommendations_scoring_data.append(scoring_dict) simulation_epc = self.epc_record.prepared_epc.copy() @@ -1258,6 +1259,10 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True + # If the property is in a conservation area, don't recommend + if self.restricted_measures: + return False + is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"] is_valid_roof_type = ( self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"] diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index f85ceacc..949c8e4c 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -639,8 +639,10 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) recommendations_scoring_data = recommendations_scoring_data.drop( - columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", - "carbon_ending"] + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + "carbon_ending" + ] ) all_predictions = await model_api.async_paginated_predictions( diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py new file mode 100644 index 00000000..1de91b50 --- /dev/null +++ b/etl/customers/lambeth/re-knocks.py @@ -0,0 +1,23 @@ +import pandas as pd + +data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route", + header=1 +) + +data["Outcomes"].value_counts() + +# Strip out: No + +df = data[data["Outcomes"] == "See notes"] +notes_df = df[ + ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)")].value_counts().to_frame() + +example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)"] == ('Access to rear of property only through number 10. Overgrown athe rear of property ' + 'installer wont be able to access') + ] + +# 18 did not attend +# diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 50dadcaf..95fe4fcd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -4093,7 +4093,9 @@ def revised_model(): 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', - 'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band', + 'Solar PV', 'Other measures', + 'Survey: Current SAP Rating', + 'Survey: Current EPC Band', 'Survey: Primary Energy Use (kWh/yr)', 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', 'Survey: Number of Storeys', 'Survey: Fuel Bill', @@ -4148,7 +4150,8 @@ def revised_model(): 'Best Match Organisation Reference', 'Survey: Current EPC Band', 'Survey: Current SAP Rating', - "Was Surveyed" + "Was Surveyed", + "match_confidence", ] ].rename( columns={ @@ -4157,11 +4160,13 @@ def revised_model(): 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" } ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]], how="left", on="Organisation Reference" ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]].rename( columns={ "Organisation Reference": "Best Match - Organisation Reference", "Walls": "Best Match - Walls", @@ -4169,7 +4174,8 @@ def revised_model(): "Heating": "Best Match - Heating", "Main Fuel": "Best Match - Main Fuel", "Age": "Best Match - Age", - "Property Type": "Best Match - Property Type" + "Property Type": "Best Match - Property Type", + "Total Floor Area": "Best Match - Total Floor Area" } ), how="left", @@ -4180,7 +4186,8 @@ def revised_model(): "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', - 'Survey: Main Building Wall Area (m2)', + 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)', + 'Survey: Main Building Age Band', ] ].rename( columns={ @@ -4203,7 +4210,12 @@ def revised_model(): 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', - 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed" + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed", + 'Main Wall Insulation', + 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', + 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', + 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', + 'Other measures', "2 of 3 Data Sources Have Consensus on EPC" ] ].rename( columns={ @@ -4271,7 +4283,7 @@ def revised_model(): worksheet = worksheet.drop(columns=["Last EPC - uprn"]) # Save to Excel with multiple sheets - excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx") with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index dba85b3f..1b937b2d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -258,18 +258,16 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight" - DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "HouseName" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" + DATA_FILENAME = "Stonewater All Props for EPC Check 10.02.25.xlsx" + SHEET_NAME = "stonewater sap, insta" + POSTCODE_COLUMN = "Post Code" + FULLADDRESS_COLUMN = "Name" + ADDRESS1_COLUMN = "Name" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [ - "HouseName", "Block", "Address1" - ] + ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = 'Built In Year' + PROPERTY_YEAR_BUILT = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -633,7 +631,7 @@ def app(): # We want to deduce if flats have 50% of the properties below C75 # We group by postcode and property type - grouped = asset_list.groupby(["Postcode", "Property Type"]) + grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) flat_data = [] for _, group in grouped: @@ -643,7 +641,7 @@ def app(): flat_data.append( { - "Postcode": group["Postcode"].iloc[0], + "Postcode": group[POSTCODE_COLUMN].iloc[0], "Property Type": "Flat", "Number of Flats with EPC": num_flats, "Number of Flats below C75": num_below_c75, From ebed7027ac721353593f089e015a9467ae6fa43e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Feb 2025 22:33:01 +0000 Subject: [PATCH 30/72] adding minimums for the number of SAP points solar PV will deliver --- backend/Property.py | 4 +++- recommendations/Recommendations.py | 7 +++++++ recommendations/SolarPvRecommendations.py | 21 ++++++++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index e19970eb..eaffd54d 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1259,7 +1259,9 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True - # If the property is in a conservation area, don't recommend + # If the property is in a conservation area, is listed or is a heriage building, solar panels + # become a difficult measure to generally get through planning restrictions and so we do not recommend + # solar panels if self.restricted_measures: return False diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 42f4e783..715332a5 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -623,6 +623,13 @@ class Recommendations: if li_sap_limit is not None: property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit) + if rec["type"] == "solar_pv": + # We use the SAP points in the recommendation as a minimum + property_phase_impact["sap"] = ( + rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else + property_phase_impact["sap"] + ) + # Insert this information into the recommendation. if not rec.get("survey", False): rec["sap_points"] = property_phase_impact["sap"] diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 95f189d3..a97dbcb3 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -14,11 +14,16 @@ class SolarPvRecommendations: # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group SOLAR_PANEL_WATTAGE = 400 + # For domestic properties, we don't recommend a solar PV system with wattage outside of these + # bounds MAX_SYSTEM_WATTAGE = 6000 MIN_SYSTEM_WATTAGE = 1000 + # the maximum area of root we allow to be covered in solar panels for our recommendations. MAX_ROOF_AREA_PERCENTAGE = 0.7 + SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1 + def __init__(self, property_instance): """ :param property_instance: Instance of the Property class, for the home associated to property_id @@ -212,6 +217,20 @@ class SolarPvRecommendations: roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100) # We round up to the nearest 5 roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5 + + # Typically, we've observed that every 5% of additional roof coverage will result in at least + # an additional 1 SAP points (though often 2 points) Given this, we can add a reasonable minimum + # for the number of SAP points we might expect. We've observed that for some cases where properties + # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict + # the number of SAP points. This appears to be due to a relatively small number of properties + # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a + # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels. + # Because panels are the final recommendation, they are often the measure that takes the home + # into the medium to high EPC A ranges and so because of a lack of training data, this means that + # we might sometime under-predict. This minimum is intended to try and reduce the negative impact + # of this. This minimum is used in Recommendations.calculate_recommendation_impact + minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE + for has_battery in [False, True]: cost_result = self.costs.solar_pv( has_battery=has_battery, @@ -240,7 +259,7 @@ class SolarPvRecommendations: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": minimum_sap_points, "already_installed": already_installed, **cost_result, # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we From 89d49690b5c9ca4efb89f3879bb7c414098e5ea2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 16 Feb 2025 17:02:51 +0000 Subject: [PATCH 31/72] added extraction of windows sap point --- etl/customers/remote_assessments/app.py | 12 ++++++------ recommendations/WindowsRecommendations.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 70ceb76d..cce0f4fb 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 128 +PORTFOLIO_ID = 129 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "46", - "postcode": "BS6 7BD", - "uprn": 61091, + "address": "19", + "postcode": "IP21 4YJ", + "uprn": 2630134524, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 61091, - "valuation": 897_000 + "uprn": 2630134524, + "valuation": 96_000 } ] # Store valuation data to s3 diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py index 1f755369..46e56c93 100644 --- a/recommendations/WindowsRecommendations.py +++ b/recommendations/WindowsRecommendations.py @@ -215,21 +215,29 @@ class WindowsRecommendations: "glazed-type": glazed_type_ending, } + measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing" + + non_invasive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]), + {} + ) + self.recommendation = [ { "phase": phase, "parts": [], "type": "windows_glazing", - "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing", + "measure_type": measure_type, "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, **cost_result, "is_secondary_glazing": is_secondary_glazing, "description_simulation": description_simulation, "simulation_config": simulation_config, + "survey": non_invasive_recommendation.get("survey", None), } ] From c09b693922c8c3c8ac55648de2772312f319d487 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 16 Feb 2025 18:25:17 +0000 Subject: [PATCH 32/72] minor tweaks to engine during remote assessments --- backend/app/assumptions.py | 1 + backend/app/plan/router.py | 2 +- etl/customers/remote_assessments/app.py | 14 ++++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 841ec2c1..8d0c05be 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -54,4 +54,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = { "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, + "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85}, } diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 949c8e4c..76c172ee 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -338,7 +338,7 @@ def extract_property_request_data( # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN # we need to check existence of uprn - has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True + has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False if has_uprn: has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None] diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index cce0f4fb..ad97fd41 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 129 +PORTFOLIO_ID = 132 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,11 @@ def app(): asset_list = [ { - "address": "19", - "postcode": "IP21 4YJ", - "uprn": 2630134524, + "address": "3", + "postcode": "BB8 0JF", + "uprn": 100010509503, + "property_type": "House", + "built_form": "End-Terrace", } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +54,8 @@ def app(): valuation_data = [ { - "uprn": 2630134524, - "valuation": 96_000 + "uprn": 100010509503, + "valuation": 116_000 } ] # Store valuation data to s3 From 764dc7901f2e7fc117a4df1053b7d9fe7eb9ad34 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 18 Feb 2025 12:20:04 +0000 Subject: [PATCH 33/72] setting up EPC data extraction process for creation of reports --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 14 +-- etl/route_march_data_pull/app.py | 16 +-- survey_report/app.py | 152 +++++++++++++++++++++--- 5 files changed, 151 insertions(+), 35 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index ad97fd41..15f59c5e 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 132 +PORTFOLIO_ID = 133 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,11 +19,9 @@ def app(): asset_list = [ { - "address": "3", - "postcode": "BB8 0JF", - "uprn": 100010509503, - "property_type": "House", - "built_form": "End-Terrace", + "address": "40", + "postcode": "PE4 5BB", + "uprn": 100090220519, } ] asset_list = pd.DataFrame(asset_list) @@ -54,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 100010509503, - "valuation": 116_000 + "uprn": 100090220519, + "valuation": 135_000 } ] # Store valuation data to s3 diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1b937b2d..f9cb7cbb 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -258,16 +258,16 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" - DATA_FILENAME = "Stonewater All Props for EPC Check 10.02.25.xlsx" - SHEET_NAME = "stonewater sap, insta" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Name" - ADDRESS1_COLUMN = "Name" - ADDRESS1_METHOD = None + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing" + DATA_FILENAME = "Community Housing PV data pull.xlsx" + SHEET_NAME = "Community Housing" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Full Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = None + PROPERTY_YEAR_BUILT = "Build_Date" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/survey_report/app.py b/survey_report/app.py index be31bd52..774d2a15 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,4 +1,5 @@ import os +import requests import PyPDF2 from string import Template @@ -31,31 +32,135 @@ def generate_html_report(template_path, output_path, data): print(f"HTML report generated successfully: {output_path}") +class PlacidApi: + # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors + ERROR_CODES = { + 400: "Bad request", + 401: "Unauthorized", + 404: "Template Not found", + 422: "Validation error", + 429: "Rate limit exceeded", + 500: "Internal server error", + } + + def __init__(self, api_key): + self.api_key = api_key + + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + "Accept": "application/json", + } + + def create_pdf( + self, + template_uuid: str, + current_epc_rating: str, + current_epc_rating_colour: str, + post_retrofit_epc_rating: str, + post_retrofit_epc_rating_colour: str, + ): + url = "https://api.placid.app/api/rest/pdfs" + + body = { + "webhook_success": None, + "passthrough": None, + "pages": [ + { + "template_uuid": template_uuid, + "layers": { + "current_epc_rating": { + "text": current_epc_rating, + "text_color": current_epc_rating_colour, + }, + "post_retrofit_epc_rating": { + "text": post_retrofit_epc_rating, + "text_color": post_retrofit_epc_rating_colour, + } + }, + }, + ] + } + + response = requests.post( + url, + headers=self.headers, + json=body + ) + + response_body = response.json() + pdf_id = response_body["id"] + + def get_pdf(self, pdf_id: str): + """ + Poll the API every 5 seconds until the PDF is ready + """ + url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}" + + response = requests.get( + url, + headers=self.headers + ) + response_body = response.json() + + url = response_body["pdf_url"] + # Download the PDF form this uurl + pdf_download = requests.get(url) + with open("output.pdf", "wb") as f: + f.write(pdf_download.content) + + def handle(): """ Performs the data extraction process for the survey report :return: """ + PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa" + TEMPLATE_UUID = "hnwqgtumckfbf" + placid_api = PlacidApi(PLACID_API_KEY) + + EPC_COLOURS = { + "A": "#117d58", + "B": "#2da55c", + "C": "#8dbd40", + "D": "#f7cd14", + "E": "#f3a96a", + "F": "#ef8026", + "G": "#e41e3b", + } + folders = [ - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5", + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " + "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS " + "ROAD FLAT 1 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " + "WILLIS ROAD FLAT 1 POST EPR PDF.pdf" + }, + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " + "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS " + "ROAD FLAT 2 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " + "WILLIS ROAD FLAT 2 POST EPR PDF.pdf" + }, + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " + "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " + "ROAD FLAT 3 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " + "WILLIS ROAD FLAT 3 POST EPR PDF.pdf" + }, ] data = [] - for data_folder in folders: + for data_config in folders: - folder_contents = os.listdir(data_folder) - # We look for the following files: - # Site notes file_mapping = {} - for file in folder_contents: - # Check if it's a pdf file - if not file.endswith(".pdf"): - continue - filepath = os.path.join(data_folder, file) + for filename, filepath in data_config.items(): with (open(filepath, "rb") as f): pdf = PyPDF2.PdfReader(f) first_page = pdf.pages[0].extract_text() @@ -66,16 +171,27 @@ def handle(): # Check the report type report_type = detect_report_type(first_page) if report_type is not None: - file_mapping[report_type] = text + file_mapping[filename] = text # This is only set up to work with quido site notes so we must have it - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"]) site_notes = site_notes_extractor.extract_all() # We also must have an EPR - epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr_extractor = EPRExtractor(file_mapping["epr"]) epr = epr_extractor.extract_all() + scenario_epr = EPRExtractor(file_mapping["scenario_epr"]) + scenario_epr = scenario_epr.extract_all() + + report_data = { + "template_uuid": TEMPLATE_UUID, + "current_epc_rating": site_notes["Current EPC Band"], + "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]], + post_retrofit_epc_rating: str, + post_retrofit_epc_rating_colour: str, + } + # We now produce the combined data sheet which is the starting figure: data_sheet = {**epr, **site_notes} del data_sheet['Building Dimensions'] @@ -83,7 +199,9 @@ def handle(): data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] del data_sheet["Total Building Dimensions"] + data.append(data_sheet) + data = pd.DataFrame(data) # Generate the HTML report From 0de14c4e286b05ecd881aa05f81f1f6172472589 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 18 Feb 2025 19:49:29 +0000 Subject: [PATCH 34/72] quidos site notes extraction --- backend/ml_models/Valuation.py | 26 ++++++++- etl/route_march_data_pull/app.py | 69 ++++++++++++++++++---- survey_report/app.py | 92 ++++++++++++++++++++++------- survey_report/extraction/quidos.py | 94 +++++++++++++++++++++++++++++- 4 files changed, 243 insertions(+), 38 deletions(-) diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 720005d3..6d4852b2 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -1,5 +1,4 @@ import numpy as np -from scipy.constants import value class PropertyValuation: @@ -216,6 +215,30 @@ class PropertyValuation: cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) ) + current_epc = property_instance.data["current-energy-rating"] + + if not current_value: + return { + "current_value": 0, + "lower_bound_increased_value": 0, + "upper_bound_increased_value": 0, + "average_increased_value": 0, + "average_increase": 0 + } + + return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) + + @classmethod + def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param current_value: + :param current_epc: + :param target_epc: + :param total_cost: + :return: + """ + if not current_value: return { "current_value": 0, @@ -225,7 +248,6 @@ class PropertyValuation: "average_increase": 0 } - current_epc = property_instance.data["current-energy-rating"] # We get the spectrum of ratings between the current and target EPC epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1] diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index f9cb7cbb..ee6a46d3 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -24,21 +24,24 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False): +def get_data( + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, + epc_api_only=False +): epc_data = [] errors = [] no_epc = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] - house_number = home[address1_column].strip() + house_number = str(home[address1_column]).strip() full_address = home[fulladdress_column].strip() house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) if house_no is None: house_no = house_number uprn = manual_uprn_map.get(full_address, None) - if uprn is None and home.get("uprn"): - uprn = home["uprn"] + if uprn is None and home.get(uprn_column): + uprn = home[uprn_column] if pd.isnull(uprn): uprn = None @@ -149,7 +152,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m return epc_data, errors, no_epc -def extract_address1(asset_list, full_address_col, method="first_two_words"): +def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): if method == "first_two_words": asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list @@ -158,6 +161,13 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"): asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] return asset_list + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + raise ValueError(f"Method {method} not recognized") @@ -258,16 +268,29 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing" - DATA_FILENAME = "Community Housing PV data pull.xlsx" - SHEET_NAME = "Community Housing" - POSTCODE_COLUMN = "Postcode" + # For Westward + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + # DATA_FILENAME = "WESTWARD - completed list..xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = "WFT EDIT Postcode" + # FULLADDRESS_COLUMN = "Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "house_number_extraction" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = "Build date" + # UPRN_COLUMN = "UPRN" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = 'Full Address.1' FULLADDRESS_COLUMN = "Full Address" ADDRESS1_COLUMN = None ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build_Date" + PROPERTY_YEAR_BUILT = "Build Date" + UPRN_COLUMN = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -299,7 +322,10 @@ def app(): if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" asset_list = extract_address1( - asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD + asset_list=asset_list, + full_address_col=FULLADDRESS_COLUMN, + postcode_col=POSTCODE_COLUMN, + method=ADDRESS1_METHOD ) if FULLADDRESS_COLUMN is None: @@ -315,6 +341,23 @@ def app(): asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) + if UPRN_COLUMN is not None: + # Check if it's numeric and if so, make sure it's an integer + def convert_uprn(x): + + if pd.isnull(x): + return x + + # check if numeric + if np.isreal(x): + return str(int(x)) + + if str(x).isdigit(): + return str(int(x)) + return x + + asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) + # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] if asset_list["deduper"].duplicated().sum(): @@ -342,7 +385,8 @@ def app(): fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP + manual_uprn_map=MANUAL_UPRN_MAP, + uprn_column=UPRN_COLUMN ) # We now retrieve any failed properties @@ -535,6 +579,7 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: + raise Exception("THIS WAS WRONG!") asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 ) diff --git a/survey_report/app.py b/survey_report/app.py index 774d2a15..f6eddb8d 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -32,6 +32,15 @@ def generate_html_report(template_path, output_path, data): print(f"HTML report generated successfully: {output_path}") +def stringify_number(num: int, rounding: bool = True) -> str: + if num < 100000: # 5 figures or fewer + rounded_num = ((num + 99) // 100) * 100 if rounding else num + return f"{rounded_num:,}" + else: # More than 5 figures + rounded_num = ((num + 999) // 1000) * 1000 if rounding else num + return f"{rounded_num // 1000}k" + + class PlacidApi: # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors ERROR_CODES = { @@ -89,7 +98,8 @@ class PlacidApi: ) response_body = response.json() - pdf_id = response_body["id"] + + return response_body def get_pdf(self, pdf_id: str): """ @@ -106,20 +116,22 @@ class PlacidApi: url = response_body["pdf_url"] # Download the PDF form this uurl pdf_download = requests.get(url) - with open("output.pdf", "wb") as f: + with open("survey_report/example_data/output.pdf", "wb") as f: f.write(pdf_download.content) -def handle(): +def handler(): """ Performs the data extraction process for the survey report :return: """ PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa" - TEMPLATE_UUID = "hnwqgtumckfbf" + TEMPLATE_UUID = "5bst9mh1q9lk9" placid_api = PlacidApi(PLACID_API_KEY) + current_property_value = 250000 # Needs to be an input + EPC_COLOURS = { "A": "#117d58", "B": "#2da55c", @@ -136,26 +148,27 @@ def handle(): "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS " "ROAD FLAT 1 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " - "WILLIS ROAD FLAT 1 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 1/3 WILLIS ROAD FLAT 1 POST EPR SITE NOTES.pdf" }, { "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS " "ROAD FLAT 2 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " - "WILLIS ROAD FLAT 2 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 2/3 WILLIS ROAD FLAT 2 POST EPR SITE NOTES.pdf" }, { "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " "ROAD FLAT 3 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " - "WILLIS ROAD FLAT 3 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 3/3 WILLIS ROAD FLAT 3 POST EPR SITE NOTES.pdf" }, ] + data = [] for data_config in folders: @@ -181,26 +194,61 @@ def handle(): epr_extractor = EPRExtractor(file_mapping["epr"]) epr = epr_extractor.extract_all() - scenario_epr = EPRExtractor(file_mapping["scenario_epr"]) - scenario_epr = scenario_epr.extract_all() + # Valuation simulation + scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"]) + scenario_site_notes = scenario_site_notes_extractor.extract_all() + + from backend.ml_models.Valuation import PropertyValuation + valuation_uplift = PropertyValuation.estimate_valuation_improvement( + current_value=current_property_value, + current_epc=site_notes["Current EPC Band"], + target_epc=scenario_site_notes["Current EPC Band"], + ) + # TODO - should convert this, when it's more than 5 figures and we should certainly stringify this + + valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value) + + # Prepare the data for output + bill_savings = round( + site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)'] + ) + + carbon_savings = round( + site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"], + 2 + ) + + payback_period = None + if payback_period is None: + raise NotImplementedError("Implement me") + + # We extract the measures from the site notes report_data = { - "template_uuid": TEMPLATE_UUID, "current_epc_rating": site_notes["Current EPC Band"], "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]], - post_retrofit_epc_rating: str, - post_retrofit_epc_rating_colour: str, + "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"], + "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]], + "bill_savings": stringify_number(bill_savings), + "valuation_improvement": stringify_number(valuation_difference), + "carbon_savings": carbon_savings, + } # We now produce the combined data sheet which is the starting figure: - data_sheet = {**epr, **site_notes} - del data_sheet['Building Dimensions'] - # We unnest the Total Building Dimensions - data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] - data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] - del data_sheet["Total Building Dimensions"] + # data_sheet = {**epr, **site_notes} + # del data_sheet['Building Dimensions'] + # # We unnest the Total Building Dimensions + # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + # del data_sheet["Total Building Dimensions"] - data.append(data_sheet) + create_pdf_response = placid_api.create_pdf( + template_uuid=TEMPLATE_UUID, **report_data + ) + # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None} + # Download locally + placid_api.get_pdf(create_pdf_response["id"]) data = pd.DataFrame(data) diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index 374df084..2e772886 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -108,8 +108,98 @@ class SiteNotesExtractor: self.extract_carbon_emissions() self.extract_bills_estimate() self.extract_building_dimensions() + + # Extract specific measures + # Primary wall + # Secondary wall + # Roof + # Floor + # Heating system + # Hot water system + # Windows + # Doors + # Lighting + # Ventilation + # Solar + return self.data + def extract_walls(self): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative wall details within the 7.0 Walls section of the summary PDF text. + """ + + text = self.text + wall_data = [] + + # Isolate the 7.0 Walls section + wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL) + if not wall_section_match: + raise ValueError("Failed to locate the walls section in the text.") + + wall_section = wall_section_match.group(1) + + # Define patterns to match walls for each building part + wall_pattern = re.compile( + r"(?P
Main Property(?: Alternative)?|Extension \d+)\s*\n" + r"(?:Construction\s*(?P[^\n]*)\n)?" + r"(?:Insulation\s*(?P[^\n]*)\n)?" + r"(?:Insulation Thickness\(mm\)\s*(?P[^\n]*)\n)?" + r"(?:Wall Thickness Measured\?\s*(?P[^\n]*)\n)?" + r"(?:Wall Thickness\(mm\)\s*(?P\d+))?", + re.MULTILINE + ) + + # TODO: We aren't effectively picking up alternative walls + # alt_wall_pattern = re.compile( + # r"Alternative Wall Sheltered\s*.*?\n" + # r".*?Construction\s*(?P[^\n]*)\n" + # r"Insulation\s*(?P[^\n]*)\n" + # r"Insulation Thickness\(mm\)\s*(?P[^\n]*)\n" + # r"Wall Thickness Measured\?\s*(?P[^\n]*)\n" + # r"Wall Thickness\(mm\)\s*(?P\d+)?", + # re.MULTILINE + # ) + + for match in wall_pattern.finditer(wall_section): + building_part = match.group("section") + # has_alternative_wall = "Alternative" in building_part + building_part = "Main Property" if "Main Property" in building_part else building_part + + wall_entry = { + "Building Part": building_part, + "Wall Type": match.group("construction") or "Unknown", + "Wall Insulation": match.group("insulation") or "Unknown", + "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown", + "Wall Thickness Measured": match.group("thickness_measured") or "Unknown", + "Wall Thickness (mm)": int(match.group("thickness")) if match.group("thickness") and match.group( + "thickness").isdigit() else None, + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Insulation Thickness (mm)": None, + "Alternative Wall Thickness Measured": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if an alternative wall section exists + # if has_alternative_wall: + # alt_match = alt_wall_pattern.search(wall_section, match.end()) + # if alt_match: + # wall_entry["Alternative Wall Type"] = alt_match.group("alt_construction") or "Unknown" + # wall_entry["Alternative Wall Insulation"] = alt_match.group("alt_insulation") or "Unknown" + # wall_entry["Alternative Insulation Thickness (mm)"] = alt_match.group( + # "alt_insulation_thickness") or "Unknown" + # wall_entry["Alternative Wall Thickness Measured"] = alt_match.group( + # "alt_thickness_measured") or "Unknown" + # wall_entry["Alternative Wall Thickness (mm)"] = int( + # alt_match.group("alt_thickness")) if alt_match.group("alt_thickness") and alt_match.group( + # "alt_thickness").isdigit() else None + + wall_data.append(wall_entry) + + return wall_data + class EPRExtractor: """ @@ -123,7 +213,7 @@ class EPRExtractor: self.text = pdf_text self.data = {} - def extract_heating_data(self): + def extract_heating_consumption(self): """ Extracts space heating and water heating values from the report. """ @@ -162,5 +252,5 @@ class EPRExtractor: Runs all extraction methods and returns a dictionary with extracted data. """ self.extract_address() - self.extract_heating_data() + self.extract_heating_consumption() return self.data From 55d2df17877d184b3bd9874a6da47cab6d3e6450 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 10:12:22 +0000 Subject: [PATCH 35/72] debygging epc searcher --- backend/SearchEpc.py | 3 + etl/route_march_data_pull/app.py | 95 +++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index c74a0b1f..e8a9dfaa 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -331,6 +331,9 @@ class SearchEpc: if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] + if data: + api_response["msg"] = self.SUCCESS + return api_response["msg"] def filter_rows(self, rows, property_type=None, address=None): diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index ee6a46d3..57239989 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,6 +4,7 @@ from BaseUtility import Definitions import pandas as pd import numpy as np from tqdm import tqdm +from datetime import datetime from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -172,7 +173,10 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t def process_age_band(x, year_built_column): - year_built = float(x[year_built_column]) + if isinstance(x[year_built_column], datetime): + year_built = x[year_built_column].year + else: + year_built = float(x[year_built_column]) if pd.isnull(x["Property Age Band"]) or ( x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES @@ -195,6 +199,12 @@ def process_age_band(x, year_built_column): if year_built < 2007: return "EPC Age Band is older than Year Built" + if x["Property Age Band"] == "England and Wales: 2012 onwards": + if year_built >= 2012: + return "EPC Age Band Matches Year Built" + if year_built < 2012: + return "EPC Age Band is older than Year Built" + if x["Property Age Band"] == "England and Wales: before 1900": if year_built < 1900: return "EPC Age Band Matches Year Built" @@ -206,7 +216,7 @@ def process_age_band(x, year_built_column): # so we extract the lower and upper date age_band = x["Property Age Band"].split(": ")[1] lower_date, upper_date = age_band.split("-") - if year_built <= float(upper_date) and year_built <= float(upper_date): + if year_built <= float(upper_date) and year_built >= float(lower_date): return "EPC Age Band Matches Year Built" if year_built > float(upper_date): @@ -269,28 +279,33 @@ def app(): # - Or the insulation required is loft/cavity (floors should be solid) # For Westward - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - # DATA_FILENAME = "WESTWARD - completed list..xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = "WFT EDIT Postcode" - # FULLADDRESS_COLUMN = "Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "house_number_extraction" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build date" - # UPRN_COLUMN = "UPRN" - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + DATA_FILENAME = "WESTWARD - completed list..xlsx" SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = 'Full Address.1' - FULLADDRESS_COLUMN = "Full Address" + POSTCODE_COLUMN = "WFT EDIT Postcode" + FULLADDRESS_COLUMN = "Address" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" + ADDRESS1_METHOD = "house_number_extraction" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build Date" - UPRN_COLUMN = None + PROPERTY_YEAR_BUILT = "Build date" + UPRN_COLUMN = "UPRN" + # If we have the non-intrusives data, this should be true + HAS_NON_INTRUSIVES = True + + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = 'Full Address.1' + # FULLADDRESS_COLUMN = "Full Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "first_word" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = "Build Date" + # UPRN_COLUMN = None + # # If we have the non-intrusives data, this should be true + # HAS_NON_INTRUSIVES = True # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -358,6 +373,20 @@ def app(): asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) + # We attempt to process the year built column + if PROPERTY_YEAR_BUILT is not None: + # We check if we have a datetime + if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime): + # We treat any string columns - with common values we see + datetime_remap = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap) + + asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT]) + # Convert this to year + asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year + # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] if asset_list["deduper"].duplicated().sum(): @@ -579,11 +608,35 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: - raise Exception("THIS WAS WRONG!") asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 ) + if HAS_NON_INTRUSIVES: + # Empty cavity: + # 1) Has been flagged on the non-intrusives as being empty or partially filled + # 2) The age is before 1995 + # 3) Remove anything that likley has access issues + asset_list["Suitable for Cavity Fill"] = ( + (asset_list["Construction"] == "CAVITY") & + asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & + ( + (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995 + ) + ) + + # asset_list["Suitable for Extraction"] = + asset_list[ + (asset_list["Construction"] == "Cavity") & + asset_list["Insulated"].isin(["RETRO DRILLED"]) & + ( + (asset_list[PROPERTY_YEAR_BUILT] <= 1995) + ) & + ( + asset_list[] + ) + ] + # 4) Flag properties that look like they're good candidates for solar installs # Firstly, flag if the fabric is completely done From 8432b7d202c24962bae64b04023600de13a6a03d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 11:50:28 +0000 Subject: [PATCH 36/72] creating the asset list class --- asset_list/AssetList.py | 64 ++++++++++++ etl/route_march_data_pull/app.py | 166 +++++++++++++++++++++---------- 2 files changed, 180 insertions(+), 50 deletions(-) create mode 100644 asset_list/AssetList.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py new file mode 100644 index 00000000..2a16e82f --- /dev/null +++ b/asset_list/AssetList.py @@ -0,0 +1,64 @@ +import os +import pandas as pd + + +class AssetList: + """ + This class is used to standardise asset lists so that we can process the core information in a consistent manner. + """ + + # These are the accepted methods we have for cleaning the address1 column + ADDRESS_1_CLEANING_METHODS = [ + "first_two_words", # This method will split on the fist two words, where the separator is a space + "first_word", # This method will split on the first word, where the separator is a space + "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber + "address1_extraction" # This method will use the NLP model to extract address1 + ] + + def __init__( + self, + local_filepath, + sheet_name, + address1_colname, + postcode_colname, + full_address_colname, + full_address_cols_to_concat=None, + missing_postcodes_method=None, + landlord_year_built=None, + landlord_uprn=None, + header=0 + ): + self.local_filepath = local_filepath + self.sheet_name = sheet_name + self.standardised_asset_list = None + # Read in the data + self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + + # We detect the presence of the non-intrusive columns + self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False + + # Names of columns + self.address1_colname = address1_colname + self.postcode_colname = postcode_colname + self.full_address_colname = full_address_colname + self.landlord_year_built = landlord_year_built + self.landlord_uprn = landlord_uprn + + # parameters for cleaning + self.full_address_cols_to_concat = full_address_cols_to_concat + self.missing_postcodes_method = missing_postcodes_method + + def standardise(self): + """ + This function is used to standardise the asset list + :return: standardised asset list + """ + + # We keep just the columns we care about and will work through the various columns and standardise + self.standardised_asset_list = self.raw_asset_list[ + [ + + ] + ] + + raise NotImplementedError diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 57239989..06082774 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -5,6 +5,7 @@ import pandas as pd import numpy as np from tqdm import tqdm from datetime import datetime +from asset_list.AssetList import AssetList from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t raise ValueError(f"Method {method} not recognized") -def process_age_band(x, year_built_column): - if isinstance(x[year_built_column], datetime): - year_built = x[year_built_column].year - else: - year_built = float(x[year_built_column]) +def process_age_band(asset_list, year_built_column): + processed_age_band = [] + for _, x in asset_list.iterrows(): - if pd.isnull(x["Property Age Band"]) or ( - x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES - ) or pd.isnull(year_built): - return "No EPC Age Band" + if pd.isnull(x["Property Age Band"]) or ( + x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES + ): + processed_age_band.append({ + "row_id": x["row_id"], + "epc_year_lower_bound": None, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": "No EPC Age Band" + }) + continue - # We check if we have a numeric data - if x["Property Age Band"].isdigit(): - if year_built == float(x["Property Age Band"]): - return "EPC Age Band Matches Year Built" - if year_built > float(x["Property Age Band"]): - return "EPC Age Band is older than Year Built" - if year_built < float(x["Property Age Band"]): - return "EPC Age Band is newer than Year Built" + # We exatract the upper and lower bounds + if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]: + year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012 - # Handle specific case - if x["Property Age Band"] == "England and Wales: 2007 onwards": - if year_built >= 2007: - return "EPC Age Band Matches Year Built" - if year_built < 2007: - return "EPC Age Band is older than Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound + else "EPC Age Band is older than Year Built" + ) - if x["Property Age Band"] == "England and Wales: 2012 onwards": - if year_built >= 2012: - return "EPC Age Band Matches Year Built" - if year_built < 2012: - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": year_lower_bound, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if x["Property Age Band"] == "England and Wales: before 1900": - if year_built < 1900: - return "EPC Age Band Matches Year Built" - if year_built >= 1900: - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"] == "England and Wales: before 1900": - # Age band will be formatted as such: - # 'England and Wales: {upper date}-{lower date}' - # so we extract the lower and upper date - age_band = x["Property Age Band"].split(": ")[1] - lower_date, upper_date = age_band.split("-") - if year_built <= float(upper_date) and year_built >= float(lower_date): - return "EPC Age Band Matches Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] < 1900 + else "EPC Age Band is newer than Year Built" + ) - if year_built > float(upper_date): - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": None, + "epc_year_upper_bound": 1899, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if year_built < float(upper_date): - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"].isdigit(): - raise Exception("Should not reach here") + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"]) + else "EPC Age Band is different from Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(x["Property Age Band"]), + "epc_year_upper_bound": int(x["Property Age Band"]), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + # Oherwise, we extract the upper and lower bounds + age_band = x["Property Age Band"].split(": ")[1] + lower_date, upper_date = age_band.split("-") + + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and ( + x[year_built_column] <= float(upper_date) + ) + else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date) + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(lower_date), + "epc_year_upper_bound": int(upper_date), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + + processed_age_band = pd.DataFrame(processed_age_band) + + return processed_age_band def app(): @@ -282,16 +330,27 @@ def app(): DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" DATA_FILENAME = "WESTWARD - completed list..xlsx" SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "WFT EDIT Postcode" FULLADDRESS_COLUMN = "Address" ADDRESS1_COLUMN = None ADDRESS1_METHOD = "house_number_extraction" + ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None PROPERTY_YEAR_BUILT = "Build date" UPRN_COLUMN = "UPRN" # If we have the non-intrusives data, this should be true HAS_NON_INTRUSIVES = True + PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + + invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + + asset_list = AssetList( + local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + header=0, + sheet_name=SHEET_NAME + ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -608,8 +667,10 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: - asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( - lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 + # We process the age band and merge it on + processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT) + asset_list = asset_list.merge( + processed_age_band, how="left", on="row_id" ) if HAS_NON_INTRUSIVES: @@ -621,7 +682,12 @@ def app(): (asset_list["Construction"] == "CAVITY") & asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & ( - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995 + # Shold we defer to the year built provided by the HA? + (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995) + ) & + ( + # We check if the property type column contains one of the invalid property types + ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary)) ) ) @@ -633,9 +699,9 @@ def app(): (asset_list[PROPERTY_YEAR_BUILT] <= 1995) ) & ( - asset_list[] + asset_list[PROPERTY_TYPE_COLUMN] ) - ] + ] # 4) Flag properties that look like they're good candidates for solar installs # Firstly, flag if the fabric is completely done From 7e9347e530cc52fe38ceef66163447d6fd556b5e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 12:53:09 +0000 Subject: [PATCH 37/72] setting up libpostal --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 71 +++++++++- asset_list/README.md | 172 +++++++++++++++++++++++ asset_list/requirements.txt | 3 + asset_list/tests/test_standardisation.py | 9 ++ etl/route_march_data_pull/app.py | 18 ++- 7 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 asset_list/README.md create mode 100644 asset_list/requirements.txt create mode 100644 asset_list/tests/test_standardisation.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2a16e82f..35da9c3b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,5 +1,10 @@ import os +import usaddress import pandas as pd +from utils.logger import setup_logger +from backend.SearchEpc import SearchEpc + +logger = setup_logger() class AssetList: @@ -15,6 +20,15 @@ class AssetList: "address1_extraction" # This method will use the NLP model to extract address1 ] + STANDARD_PROPERTY_TYPES = [ + "house", + "flat", + "bungalow", + "maisonette", + "park home", + "block house", + ] + def __init__( self, local_filepath, @@ -26,6 +40,10 @@ class AssetList: missing_postcodes_method=None, landlord_year_built=None, landlord_uprn=None, + landlord_property_type=None, + landlord_wall_construction=None, + landlord_heating_system=None, + landlord_existing_pv=None, header=0 ): self.local_filepath = local_filepath @@ -43,21 +61,72 @@ class AssetList: self.full_address_colname = full_address_colname self.landlord_year_built = landlord_year_built self.landlord_uprn = landlord_uprn + self.landlord_property_type = landlord_property_type + self.landlord_wall_construction = landlord_wall_construction + self.landlord_heating_system = landlord_heating_system + self.landlord_existing_pv = landlord_existing_pv # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.debug_information = { + "property_type": None, + "wall_construction": None, + "heating_system": None, + "existing_pv": None + } + + @classmethod + def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + + if method not in cls.ADDRESS_1_CLEANING_METHODS: + raise ValueError(f"Method {method} for producing address1 not recognized") + + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + if method == "address1_extraction": + + x = asset_list_df[FULLADDRESS_COLUMN].values[0] + parsed = usaddress.parse(x) + + def extract_address_1(): + + + raise ValueError(f"Method {method} not recognized") + + @staticmethod + def _address1_extraction(x): + + def standardise(self): """ This function is used to standardise the asset list :return: standardised asset list """ + if self.address1_colname is None: + # If we do not have this, we produce it + + # We keep just the columns we care about and will work through the various columns and standardise self.standardised_asset_list = self.raw_asset_list[ [ - + self.address1_colname, self.postcode_colname, self.full_address_colname, + self.landlord_year_built, self.landlord_uprn, self.landlord_property_type ] ] diff --git a/asset_list/README.md b/asset_list/README.md new file mode 100644 index 00000000..1bf734a4 --- /dev/null +++ b/asset_list/README.md @@ -0,0 +1,172 @@ +# libpostal Installation Guide for macOS M1 + +## Overview + +`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide +provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. + +--- + +## 📌 Prerequisites + +Before installing `libpostal`, ensure you have the necessary dependencies installed. + +### **1️⃣ Install Required Dependencies** + +Open a terminal and run: + +```bash +brew install curl autoconf automake libtool pkg-config +``` + +### **2️⃣ Clone the libpostal Repository** + +```bash +git clone https://github.com/openvenues/libpostal.git +cd libpostal +``` + +### **3️⃣ Run Bootstrap Script** + +```bash +./bootstrap.sh +``` + +### **4️⃣ Configure the Build (Important for M1 Macs)** + +Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility. + +```bash +./configure --disable-sse2 --datadir=/usr/local/libpostal_data +``` + +*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* + +### **5️⃣ Compile and Install** + +```bash +make -j$(sysctl -n hw.ncpu) +sudo make install +``` + +### **6️⃣ Install Python Bindings** + +Once `libpostal` is installed, install the Python package: + +```bash +pip install postal +``` + +--- + +## ✅ **Verify Installation** + +To check if `libpostal` was installed successfully, run: + +```bash +python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" +``` + +**Expected Output:** + +``` +[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')] +``` + +--- + +## 📌 **Usage Example in Python** + +### **Address Parsing** + +```python +from postal.parser import parse + +address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" +parsed_address = dict(parse(address)) + +print(parsed_address) +``` + +**Expected Output:** + +```python +{ + 'house_number': '23', + 'road': 'Clifton Hill', + 'city': 'Newtown', + 'city': 'Exeter', + 'postcode': 'EX1 2DL' +} +``` + +### **Address Normalization** + +```python +from postal.normalize import normalize_string + +address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" +normalized = normalize_string(address) + +print(normalized) +``` + +--- + +## 📌 **Troubleshooting** + +### **1️⃣ libpostal Not Found?** + +If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: + +- You ran `sudo make install` +- Your Python environment recognizes `postal`. Try: + ```bash + pip install postal --no-cache-dir + ``` +- If using a virtual environment (`venv`), activate it before running Python. + +### **2️⃣ Compilation Issues on macOS?** + +If `make` fails, try running: + +```bash +brew reinstall autoconf automake libtool pkg-config +``` + +Then restart the installation process. + +### **3️⃣ Can't Find libpostal Data Directory?** + +Ensure `libpostal_data` exists in the correct directory: + +```bash +ls /usr/local/libpostal_data +``` + +If missing, re-run `./configure` with the correct path. + +--- + +## 🛠 **Uninstallation** + +To remove `libpostal`, run: + +```bash +sudo rm -rf /usr/local/lib/libpostal* +sudo rm -rf /usr/local/include/libpostal* +rm -rf ~/libpostal +pip uninstall postal +``` + +--- + +## 📌 **Additional Resources** + +- [Libpostal GitHub](https://github.com/openvenues/libpostal) +- [Libpostal Python Bindings](https://pypi.org/project/postal/) +- [Homebrew](https://brew.sh/) + +--- + +### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt new file mode 100644 index 00000000..d77c8a58 --- /dev/null +++ b/asset_list/requirements.txt @@ -0,0 +1,3 @@ +postal +pandas +usaddress \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py new file mode 100644 index 00000000..f0e6ce11 --- /dev/null +++ b/asset_list/tests/test_standardisation.py @@ -0,0 +1,9 @@ +from asset_list.AssetList import AssetList + + +def test_address1_extraction(): + example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' + + AssetList._extract_address1( + example, + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 06082774..74dc28e0 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,10 +346,24 @@ def app(): invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] - asset_list = AssetList( + self = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, - sheet_name=SHEET_NAME + sheet_name=SHEET_NAME, + address1_colname=ADDRESS1_COLUMN, + postcode_colname=POSTCODE_COLUMN, + full_address_colname=FULLADDRESS_COLUMN, + full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, + missing_postcodes_method=MISSING_POSTCODES_METHOD, + landlord_year_built=PROPERTY_YEAR_BUILT, + landlord_uprn=UPRN_COLUMN, + landlord_property_type=PROPERTY_TYPE_COLUMN, + landlord_wall_construction="Wall Construction (EPC)", + landlord_heating_system="Heat Source", + landlord_existing_pv="PV (Y/N)" + ) + self.standardised_asset_list( + # In here, we might want to pass some specific remaps ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" From cb0194c3b96f839e5050073eb76e2f23e822c87f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 14:12:57 +0000 Subject: [PATCH 38/72] working on address extraction --- asset_list/AssetList.py | 119 +++++++++++++--- asset_list/README.md | 172 ----------------------- asset_list/requirements.txt | 7 +- asset_list/tests/test_standardisation.py | 9 +- backend/SearchEpc.py | 14 +- backend/tests/test_search_epc.py | 9 ++ etl/route_march_data_pull/app.py | 2 + 7 files changed, 130 insertions(+), 202 deletions(-) delete mode 100644 asset_list/README.md diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 35da9c3b..1a3f6180 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -17,7 +17,7 @@ class AssetList: "first_two_words", # This method will split on the fist two words, where the separator is a space "first_word", # This method will split on the first word, where the separator is a space "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber - "address1_extraction" # This method will use the NLP model to extract address1 + # "address1_extraction" # This method will use the NLP model to extract address1 ] STANDARD_PROPERTY_TYPES = [ @@ -29,6 +29,19 @@ class AssetList: "block house", ] + # Standard column Names + STANDARD_ADDRESS_1 = "domna_address_1" + STANDARD_POSTCODE = "domna_postcode" + STANDARD_FULL_ADDRESS = "domna_full_address" + STANDARD_YEAR_BUILT = "domna_year_built" + STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_HEATING_SYSTEM = "landlord_heating_system" + STANDARD_EXISTING_PV = "landlord_existing_pv" + + DOMNA_PROPERTY_ID = "domna_property_id" + def __init__( self, local_filepath, @@ -36,8 +49,10 @@ class AssetList: address1_colname, postcode_colname, full_address_colname, + landlord_property_id=None, full_address_cols_to_concat=None, missing_postcodes_method=None, + address1_extraction_method=None, landlord_year_built=None, landlord_uprn=None, landlord_property_type=None, @@ -48,14 +63,15 @@ class AssetList: ): self.local_filepath = local_filepath self.sheet_name = sheet_name - self.standardised_asset_list = None # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + self.standardised_asset_list = self.raw_asset_list.copy() # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False # Names of columns + self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname self.postcode_colname = postcode_colname self.full_address_colname = full_address_colname @@ -69,6 +85,7 @@ class AssetList: # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.address1_extraction_method = address1_extraction_method self.debug_information = { "property_type": None, @@ -77,40 +94,50 @@ class AssetList: "existing_pv": None } - @classmethod - def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): - if method not in cls.ADDRESS_1_CLEANING_METHODS: + if method not in self.ADDRESS_1_CLEANING_METHODS: raise ValueError(f"Method {method} for producing address1 not recognized") if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0] return asset_list if method == "house_number_extraction": - asset_list["address1_extracted"] = asset_list.apply( + asset_list[self.address1_colname] = asset_list.apply( lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), axis=1 ) return asset_list - if method == "address1_extraction": - - x = asset_list_df[FULLADDRESS_COLUMN].values[0] - parsed = usaddress.parse(x) - - def extract_address_1(): - - - raise ValueError(f"Method {method} not recognized") + raise ValueError(f"Method {method} not recognized") @staticmethod def _address1_extraction(x): + pass + def create_property_id(self): + """ + This function creates the domna property ID, which is simply a hash of the full address and postcode + We want all figures to be positive + :return: + """ + import sys + self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( + self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ + self.postcode_colname] + ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + + @staticmethod + def _strip_postcode_from_full_address(full_address, postcode): + cleaned = full_address.replace(postcode, "") + # Remove any trailing commas and spaces + cleaned = cleaned.rstrip(", ").strip(",").strip() + return cleaned def standardise(self): """ @@ -118,15 +145,63 @@ class AssetList: :return: standardised asset list """ - if self.address1_colname is None: - # If we do not have this, we produce it + # Remove rows without a postcode + if self.postcode_colname is not None: + self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) + # We clean up portential non-breaking spaces, and double spaces + for col in [ + c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if + c is not None + ]: + self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False) + + if self.address1_colname is None: + if self.address1_extraction_method is None: + raise ValueError("Missing address 1 - please specify an extraction method") + self.address1_colname = self.STANDARD_ADDRESS_1 + # If we do not have this, we produce it + self.standardised_asset_list = self._extract_address1( + asset_list=self.standardised_asset_list, + full_address_col=self.full_address_colname, + postcode_col=self.postcode_colname, + method=self.address1_extraction_method + ) + + if self.full_address_colname is None: + if not self.full_address_cols_to_concat: + raise ValueError("Missing full address - please specify columns to concatenate") + self.full_address_colname = self.STANDARD_FULL_ADDRESS + self.standardised_asset_list[self.full_address_colname] = ( + self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) + ) + else: + + # Make sure to strip the postcode out of the full address + self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply( + lambda x: self._strip_postcode_from_full_address( + full_address=x[self.full_address_colname], + postcode=x[self.postcode_colname] + ), + axis=1 + ) + + # We create the domna property id + self.create_property_id() # We keep just the columns we care about and will work through the various columns and standardise - self.standardised_asset_list = self.raw_asset_list[ + self.standardised_asset_list = self.standardised_asset_list[ [ - self.address1_colname, self.postcode_colname, self.full_address_colname, - self.landlord_year_built, self.landlord_uprn, self.landlord_property_type + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_year_built, + self.landlord_uprn, + self.landlord_property_type, ] ] diff --git a/asset_list/README.md b/asset_list/README.md deleted file mode 100644 index 1bf734a4..00000000 --- a/asset_list/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# libpostal Installation Guide for macOS M1 - -## Overview - -`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide -provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. - ---- - -## 📌 Prerequisites - -Before installing `libpostal`, ensure you have the necessary dependencies installed. - -### **1️⃣ Install Required Dependencies** - -Open a terminal and run: - -```bash -brew install curl autoconf automake libtool pkg-config -``` - -### **2️⃣ Clone the libpostal Repository** - -```bash -git clone https://github.com/openvenues/libpostal.git -cd libpostal -``` - -### **3️⃣ Run Bootstrap Script** - -```bash -./bootstrap.sh -``` - -### **4️⃣ Configure the Build (Important for M1 Macs)** - -Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility. - -```bash -./configure --disable-sse2 --datadir=/usr/local/libpostal_data -``` - -*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* - -### **5️⃣ Compile and Install** - -```bash -make -j$(sysctl -n hw.ncpu) -sudo make install -``` - -### **6️⃣ Install Python Bindings** - -Once `libpostal` is installed, install the Python package: - -```bash -pip install postal -``` - ---- - -## ✅ **Verify Installation** - -To check if `libpostal` was installed successfully, run: - -```bash -python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" -``` - -**Expected Output:** - -``` -[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')] -``` - ---- - -## 📌 **Usage Example in Python** - -### **Address Parsing** - -```python -from postal.parser import parse - -address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" -parsed_address = dict(parse(address)) - -print(parsed_address) -``` - -**Expected Output:** - -```python -{ - 'house_number': '23', - 'road': 'Clifton Hill', - 'city': 'Newtown', - 'city': 'Exeter', - 'postcode': 'EX1 2DL' -} -``` - -### **Address Normalization** - -```python -from postal.normalize import normalize_string - -address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" -normalized = normalize_string(address) - -print(normalized) -``` - ---- - -## 📌 **Troubleshooting** - -### **1️⃣ libpostal Not Found?** - -If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: - -- You ran `sudo make install` -- Your Python environment recognizes `postal`. Try: - ```bash - pip install postal --no-cache-dir - ``` -- If using a virtual environment (`venv`), activate it before running Python. - -### **2️⃣ Compilation Issues on macOS?** - -If `make` fails, try running: - -```bash -brew reinstall autoconf automake libtool pkg-config -``` - -Then restart the installation process. - -### **3️⃣ Can't Find libpostal Data Directory?** - -Ensure `libpostal_data` exists in the correct directory: - -```bash -ls /usr/local/libpostal_data -``` - -If missing, re-run `./configure` with the correct path. - ---- - -## 🛠 **Uninstallation** - -To remove `libpostal`, run: - -```bash -sudo rm -rf /usr/local/lib/libpostal* -sudo rm -rf /usr/local/include/libpostal* -rm -rf ~/libpostal -pip uninstall postal -``` - ---- - -## 📌 **Additional Resources** - -- [Libpostal GitHub](https://github.com/openvenues/libpostal) -- [Libpostal Python Bindings](https://pypi.org/project/postal/) -- [Homebrew](https://brew.sh/) - ---- - -### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index d77c8a58..d6d64471 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,3 +1,8 @@ postal pandas -usaddress \ No newline at end of file +usaddress +pydantic-settings==2.6.0 +epc-api-python==1.0.2 +fuzzywuzzy +boto3 +openpyxl \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py index f0e6ce11..1a083bbc 100644 --- a/asset_list/tests/test_standardisation.py +++ b/asset_list/tests/test_standardisation.py @@ -1,9 +1,12 @@ from asset_list.AssetList import AssetList +from backend.SearchEpc import + def test_address1_extraction(): example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' - AssetList._extract_address1( - example, - ) + # AssetList._extract_address1( + # example, + # ) + pass diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index e8a9dfaa..79a041ec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -208,9 +208,14 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' - match = re.search(pattern, address) - if match: - return next(g for g in match.groups() if g is not None) + match1 = re.search(pattern, address) + if match1: + return next(g for g in match1.groups() if g is not None) + + pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + match2 = re.search(pattern2, address) + if match2: + return match2.group(2) parsed = usaddress.parse(address) # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected @@ -221,7 +226,8 @@ class SearchEpc: continue if part == postcode.split(" ")[1]: continue - return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip( + ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index 3b2e2a5b..562585ad 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -48,3 +48,12 @@ class TestSearchEpcIntegration: assert epc_searcher.newest_epc["lmk-key"] == lmk_key assert epc_searcher.newest_epc["uprn"] == uprn assert len(epc_searcher.older_epcs) == n_old_epcs + + def test_search_housenumber(self): + eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + res1 = SearchEpc.get_house_number(eg1, None) + assert res1 == "A11" + + eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + res2 = SearchEpc.get_house_number(eg2, None) + assert res2 == "A9" diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 74dc28e0..fcf11765 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -352,9 +352,11 @@ def app(): sheet_name=SHEET_NAME, address1_colname=ADDRESS1_COLUMN, postcode_colname=POSTCODE_COLUMN, + landlord_property_id="UPRN", full_address_colname=FULLADDRESS_COLUMN, full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, missing_postcodes_method=MISSING_POSTCODES_METHOD, + address1_extraction_method=ADDRESS1_METHOD, landlord_year_built=PROPERTY_YEAR_BUILT, landlord_uprn=UPRN_COLUMN, landlord_property_type=PROPERTY_TYPE_COLUMN, From 0a643d80adb412ea4069664cc12efaf9e71fad42 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 14:21:29 +0000 Subject: [PATCH 39/72] building out multi-unit flagging --- asset_list/AssetList.py | 16 ++++++++++++++-- asset_list/tests/test_standardisation.py | 11 ++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 1a3f6180..fde24fe2 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,5 +1,4 @@ -import os -import usaddress +import re import pandas as pd from utils.logger import setup_logger from backend.SearchEpc import SearchEpc @@ -42,6 +41,9 @@ class AssetList: DOMNA_PROPERTY_ID = "domna_property_id" + # Regular expression for identifying if the address might point to multiple units + MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + def __init__( self, local_filepath, @@ -139,6 +141,14 @@ class AssetList: cleaned = cleaned.rstrip(", ").strip(",").strip() return cleaned + @classmethod + def _identify_multi_address(cls, address): + # We check if the address is comma separated + if "," in address: + address1_section = address.split(",")[0] + # We look for string in the form (x-y) + return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) + def standardise(self): """ This function is used to standardise the asset list @@ -205,4 +215,6 @@ class AssetList: ] ] + # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) + raise NotImplementedError diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py index 1a083bbc..b6d9a391 100644 --- a/asset_list/tests/test_standardisation.py +++ b/asset_list/tests/test_standardisation.py @@ -1,12 +1,5 @@ from asset_list.AssetList import AssetList -from backend.SearchEpc import - -def test_address1_extraction(): - example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' - - # AssetList._extract_address1( - # example, - # ) - pass +def test_multi_unit_address_flagging(): + assert AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL') From ecf8e46c65ae7e09725258bcb578690d1156bf14 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:12:29 +0000 Subject: [PATCH 40/72] getting asset list class live --- .idea/terraform.xml | 6 + asset_list/AssetList.py | 321 +++++++++++++++++++++++-- asset_list/app.py | 1 + asset_list/mappings/exising_pv.py | 8 + asset_list/mappings/heating_systems.py | 46 ++++ asset_list/mappings/property_type.py | 16 ++ asset_list/mappings/walls.py | 38 +++ asset_list/requirements.txt | 4 +- etl/route_march_data_pull/app.py | 5 +- 9 files changed, 420 insertions(+), 25 deletions(-) create mode 100644 .idea/terraform.xml create mode 100644 asset_list/app.py create mode 100644 asset_list/mappings/exising_pv.py create mode 100644 asset_list/mappings/heating_systems.py create mode 100644 asset_list/mappings/property_type.py create mode 100644 asset_list/mappings/walls.py diff --git a/.idea/terraform.xml b/.idea/terraform.xml new file mode 100644 index 00000000..cd46a3d3 --- /dev/null +++ b/.idea/terraform.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index fde24fe2..e61cc89b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,16 +1,200 @@ +import os import re +from datetime import datetime +from openai import OpenAI +import tiktoken +import numpy as np import pandas as pd +from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc +import asset_list.mappings.property_type as property_type_mappings +import asset_list.mappings.walls as walls_mappings +import asset_list.mappings.heating_systems as heating_mappings +import asset_list.mappings.exising_pv as existing_pv_mappings logger = setup_logger() +# OpenAI API Key (set this in your environment variables for security) +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + + +class DataRemapper: + def __init__(self, standard_values, standard_map=None, max_tokens=1000): + """ + Initialize the remapper with standard values and a predefined mapping. + + :param standard_values: Set of allowed standardized values. + :param standard_map: Dictionary of common remappings {raw_value: standard_value}. + """ + self.standard_values = {v.lower() for v in standard_values} # Normalize to lowercase + self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()} # Predefined mappings + self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity + self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing + + # Tokenizer for counting tokens + self.tokenizer = tiktoken.encoding_for_model(self.ai_model) + + # Track token usage and remap dictionary + self.total_tokens_used = 0 + self.total_cost = 0 + self.remap_dict = {} # {original_value: standardized_value} + self.max_tokens = 1000 # Limit for OpenAI API + + # Memoization for AI calls + self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} + # Capture the reponse for debugging + self.ai_response = None + + # OpenAI pricing (as of Feb 2024) + self.pricing = { + "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000}, + "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, + } + + self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + + @staticmethod + def clean_string(text): + """Basic text cleaning: remove extra spaces, punctuation, and normalize case.""" + if not isinstance(text, str): + return None + text = text.strip().lower() + text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + return text + + def fuzzy_match(self, text): + """Use fuzzy matching to find the closest standard value.""" + match, score = process.extractOne(text, self.standard_values) if text else (None, 0) + return match if score >= self.fuzzy_threshold else None + + def count_tokens(self, text): + """Estimate the number of tokens in a given text.""" + return len(self.tokenizer.encode(text)) if text else 0 + + def ai_standardize(self, unmapped_values): + """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.""" + if not unmapped_values: + return {} + + unmapped_tuple = tuple(sorted(unmapped_values)) # Ensure consistency for memoization + if unmapped_tuple in self.ai_cache: + return self.ai_cache[unmapped_tuple] # Return memoized result + + prompt = f""" + You are an expert in data classification. Standardize each of these values into one of the categories: + {list(self.standard_values)}. + + Return only a JSON dictionary where: + - The keys are the original values. + - The values are the standardized ones. + + Strictly return JSON **without markdown formatting** or extra text. + + Example Output: + {{ + "BLKHOUS": "block house", + "BEDSIT": "bedsit" + }} + + Values to standardize: + {unmapped_values} + """ + + # Count input tokens + input_tokens = self.count_tokens(prompt) + if input_tokens > self.max_tokens: + raise ValueError("Input tokens exceed the maximum limit.") + + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + + output_text = response.choices[0].message.content.strip() + output_tokens = self.count_tokens(output_text) # Count output tokens + + # Track total token usage + self.total_tokens_used += input_tokens + output_tokens + + # Estimate cost + input_cost = input_tokens * self.pricing[self.ai_model]["input"] + output_cost = output_tokens * self.pricing[self.ai_model]["output"] + self.total_cost += input_cost + output_cost + + try: + # Parse response as dictionary + mapping = eval(output_text) # OpenAI should return a valid dictionary + except: + mapping = {val: "unknown" for val in unmapped_values} # Fallback + + # Memoize the AI response + self.ai_cache[unmapped_tuple] = mapping + # We store the raw AI response for debugging + logger.debug(f"AI Response: {mapping}") + self.ai_response = output_text + + return mapping + + def standardize_list(self, values_to_remap): + """ + Standardizes a list of values and returns a dictionary {original_value: standardized_value}. + + :param values_to_remap: List of raw values to standardize. + :return: Dictionary {original_value: standardized_value}. + """ + unique_values = set(values_to_remap) # Process only unique values + + unmapped_values = [] + for value in unique_values: + if pd.isna(value): # Handle NaN values + self.remap_dict[value] = "unknown" + continue + + cleaned_value = self.clean_string(value) + + # Rule-Based Check (Predefined Mapping) + if cleaned_value in self.standard_map: + self.remap_dict[value] = self.standard_map[cleaned_value] + continue + + # Exact Match in Standard Values + if cleaned_value in self.standard_values: + self.remap_dict[value] = cleaned_value + continue + + # Fuzzy Matching + fuzzy_match = self.fuzzy_match(cleaned_value) + if fuzzy_match: + self.remap_dict[value] = fuzzy_match + continue + + # Capture anything that wasn't mapped + unmapped_values.append(value) + + # AI Model - remap anything unmapped (batch request) + ai_mapping = self.ai_standardize(unmapped_values) + self.remap_dict.update(ai_mapping) + + return self.remap_dict + + def report_usage(self): + """Prints a summary of token usage and cost.""" + print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}") + print(f"💰 Estimated Cost: ${self.total_cost:.4f}") + class AssetList: """ This class is used to standardise asset lists so that we can process the core information in a consistent manner. """ + DATETIME_REMAP = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + # These are the accepted methods we have for cleaning the address1 column ADDRESS_1_CLEANING_METHODS = [ "first_two_words", # This method will split on the fist two words, where the separator is a space @@ -19,15 +203,6 @@ class AssetList: # "address1_extraction" # This method will use the NLP model to extract address1 ] - STANDARD_PROPERTY_TYPES = [ - "house", - "flat", - "bungalow", - "maisonette", - "park home", - "block house", - ] - # Standard column Names STANDARD_ADDRESS_1 = "domna_address_1" STANDARD_POSTCODE = "domna_postcode" @@ -44,6 +219,15 @@ class AssetList: # Regular expression for identifying if the address might point to multiple units MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + # List of columns relating to the non-intrusive data + NON_INTRUSIVES_COLNAMES = [ + "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required", + "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION", + "Any further surveyor notes", 'Surveyors Name' + ] + + #### Mapping for wall construction + def __init__( self, local_filepath, @@ -96,6 +280,8 @@ class AssetList: "existing_pv": None } + self.variable_mappings = {} + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -149,7 +335,7 @@ class AssetList: # We look for string in the form (x-y) return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) - def standardise(self): + def init_standardise(self): """ This function is used to standardise the asset list :return: standardised asset list @@ -202,19 +388,110 @@ class AssetList: self.create_property_id() # We keep just the columns we care about and will work through the various columns and standardise - self.standardised_asset_list = self.standardised_asset_list[ - [ - self.landlord_property_id, - self.DOMNA_PROPERTY_ID, - self.address1_colname, - self.postcode_colname, - self.full_address_colname, - self.landlord_year_built, - self.landlord_uprn, - self.landlord_property_type, - ] + variables = [ + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_uprn, + self.landlord_property_type, + self.landlord_year_built, + self.landlord_wall_construction, + self.landlord_heating_system, + self.landlord_existing_pv ] + rename = {} + + if self.non_intrusives_present: + variables += self.NON_INTRUSIVES_COLNAMES + rename = { + **rename, + **dict( + zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) + ) + } + + self.standardised_asset_list = self.standardised_asset_list[variables].rename( + columns=rename + ) # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) + self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ + self.full_address_colname + ].apply(lambda x: self._identify_multi_address(x)) - raise NotImplementedError + # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and + # we see instances of "average thermal transmittance" in the description + self.standardised_asset_list[self.landlord_wall_construction] = np.where( + self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( + "average thermal transmittance" + ), + "new build - average thermal transmittance", + self.standardised_asset_list[self.landlord_wall_construction] + ) + + # Clear our build year column + + # We attempt to process the year built column + if self.landlord_year_built is not None: + # We check if we have a datetime + if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): + # We treat any string columns - with common values we see + self.standardised_asset_list[self.landlord_year_built] = ( + self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP) + ) + + self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime( + self.standardised_asset_list[self.landlord_year_built] + ) + # Convert this to year + self.standardised_asset_list[self.landlord_year_built] = ( + self.standardised_asset_list[self.landlord_year_built].dt.year + ) + else: + raise NotImplementedError("Year built column must be a datetime - implement me") + + # We now create standard lookups + to_remap = { + self.landlord_property_type: { + "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES, + "standard_map": property_type_mappings.PROPERTY_MAPPING + }, + self.landlord_wall_construction: { + "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS, + "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS + }, + self.landlord_heating_system: { + "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS, + "standard_map": heating_mappings.HEATING_MAPPINGS + }, + self.landlord_existing_pv: { + "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV, + "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS + } + } + + for variable, config in to_remap.items(): + logger.info("Standardising variable: %s", variable) + values_to_remap = self.standardised_asset_list[variable].unique() + # We want to map this to our standardised list of property types we're interested in + remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"]) + remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist()) + self.variable_mappings[variable] = remap_dictionary + + # We now print out the variable mappings, which can be reviewed by the user, before the final standardised + # asset list is returned + + def apply_standardiation(self, override_empty_mappings=False): + """ + This function applies the standardisation to the asset list + :param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant + if there are no categories which need remapping which is highly unlikely + :return: + """ + if not self.variable_mappings and not override_empty_mappings: + raise ValueError("Please run init_standardise first") + + def create_lookup_mappings(self): + pass diff --git a/asset_list/app.py b/asset_list/app.py new file mode 100644 index 00000000..21b405d8 --- /dev/null +++ b/asset_list/app.py @@ -0,0 +1 @@ +import os diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py new file mode 100644 index 00000000..1e45bd83 --- /dev/null +++ b/asset_list/mappings/exising_pv.py @@ -0,0 +1,8 @@ +STANDARD_EXISTING_PV = { + "already has PV", "no PV", "unknown" +} + +EXISTING_PV_MAPPINGS = { + "NO": "no PV", + "YES": "already has PV", +} diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py new file mode 100644 index 00000000..4fce39ab --- /dev/null +++ b/asset_list/mappings/heating_systems.py @@ -0,0 +1,46 @@ +STANDARD_HEATING_SYSTEMS = { + "gas combi boiler", + "electric storage heaters", + "district heating", + "gas condensing boiler", + "oil boiler", + "gas condensing combi", + "air source heat pump", + "boiler - other fuel", + "ground source heat pump", + "electric radiators", + "other", + "electric boiler", + "unknown", + "communal gas boiler", +} + +HEATING_MAPPINGS = { + "Combi - GAS": "gas combi boiler", + "E7 Storage Heaters": "electric storage heaters", + "District heating system": "district heating", + "Condensing Boiler - GAS": "gas condensing boiler", + "Boiler Oil/other": "oil boiler", + "Condensing Combi - Gas": "gas condensing combi", + "Air Source Source Heat Pump": "air source heat pump", + "Biomass Boiler": "boiler - other fuel", + "Ground Source Heat Pump": "ground source heat pump", + "Electric Oil filled radiators": "electric radiators", + "Solid Fuel": "other", + "LPG Boiler": "boiler - other fuel", + "Electric Boiler": "electric boiler", + "No data": "unknown", + "Boiler Communal/Commercial - GAS": "communal gas boiler", + "Eco Electric Radiators": "electric radiators", + "Gas fire": "other", + "Backboiler - Solid fuel": "other", +} + +# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system', +# 'Condensing Boiler - GAS', 'Boiler Oil/other', +# 'Condensing Combi - Gas', 'Air Source Source Heat Pump', +# 'Biomass Boiler', 'Ground Source Heat Pump', +# 'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler', +# 'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS', +# 'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'], +# dtype=object) diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py new file mode 100644 index 00000000..bcad9ede --- /dev/null +++ b/asset_list/mappings/property_type.py @@ -0,0 +1,16 @@ +# These are the standard categories for property types +STANDARD_PROPERTY_TYPES = { + "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", + "unknown", "other" +} + +# This is a basic mapping that we use to map values that we've seen commonly to standard values +PROPERTY_MAPPING = { + "HOUSE": "house", + "FLAT": "flat", + "MAISONET": "maisonette", + "BUNGALOW": "bungalow", + "BLKHOUS": "block house", + "BEDSIT": "bedsit", + "COACHSE": "coach house", +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py new file mode 100644 index 00000000..7dec7d12 --- /dev/null +++ b/asset_list/mappings/walls.py @@ -0,0 +1,38 @@ +STANDARD_WALL_CONSTRUCTIONS = { + "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", + "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", + "new build - average thermal transmittance", +} + +WALL_CONSTRUCTION_MAPPINGS = { + "New Build - Average Thermal Transmittance": "new build - average thermal transmittance", + 'Average thermal transmittance 0.25 W/m?K': 'unknown', + 'Cavity wall, as built, insulated (assumed)': 'filled cavity', + 'Average thermal transmittance 0.31 W/m?K': 'unknown', + 'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m?K': 'unknown', + 'Average thermal transmittance 0.27 W/m²K': 'unknown', + 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m?K': 'unknown', + 'Granite or whin, with internal insulation': 'granite or whinstone', + 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', + 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', + 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown', + 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', + 'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown', + 'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown', + 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', + 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', + 'Cavity wall, with internal insulation': 'filled cavity', + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown' +} diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index d6d64471..0c16c43a 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -5,4 +5,6 @@ pydantic-settings==2.6.0 epc-api-python==1.0.2 fuzzywuzzy boto3 -openpyxl \ No newline at end of file +openpyxl +openai +tiktoken \ No newline at end of file diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index fcf11765..ca5195d6 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -364,10 +364,11 @@ def app(): landlord_heating_system="Heat Source", landlord_existing_pv="PV (Y/N)" ) - self.standardised_asset_list( - # In here, we might want to pass some specific remaps + self.init_standardise( ) + self.apply_transformations() + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" # SHEET_NAME = "Sheet1" From 978deb286bc411a563631e81685319a38ef9061e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:32:05 +0000 Subject: [PATCH 41/72] debugging remapper --- asset_list/AssetList.py | 19 ++++++++++---- asset_list/mappings/exising_pv.py | 4 +++ asset_list/mappings/heating_systems.py | 17 ++++++------- asset_list/mappings/property_type.py | 2 ++ asset_list/mappings/walls.py | 34 +++++++++++++++++++++++++- etl/route_march_data_pull/app.py | 5 ++-- 6 files changed, 63 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e61cc89b..8f905a33 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -27,8 +27,8 @@ class DataRemapper: :param standard_values: Set of allowed standardized values. :param standard_map: Dictionary of common remappings {raw_value: standard_value}. """ - self.standard_values = {v.lower() for v in standard_values} # Normalize to lowercase - self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()} # Predefined mappings + self.standard_values = standard_values + self.standard_map = standard_map self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing @@ -39,7 +39,7 @@ class DataRemapper: self.total_tokens_used = 0 self.total_cost = 0 self.remap_dict = {} # {original_value: standardized_value} - self.max_tokens = 1000 # Limit for OpenAI API + self.max_tokens = max_tokens # Limit for OpenAI API # Memoization for AI calls self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} @@ -61,6 +61,8 @@ class DataRemapper: return None text = text.strip().lower() text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + # Replace double strings + text = re.sub(r'\s+', ' ', text) return text def fuzzy_match(self, text): @@ -106,6 +108,7 @@ class DataRemapper: if input_tokens > self.max_tokens: raise ValueError("Input tokens exceed the maximum limit.") + logger.info("Calling OpenAI API for standardization...") response = self.openai_client.chat.completions.create( model=self.ai_model, messages=[{"role": "user", "content": prompt}], @@ -156,8 +159,14 @@ class DataRemapper: cleaned_value = self.clean_string(value) # Rule-Based Check (Predefined Mapping) - if cleaned_value in self.standard_map: - self.remap_dict[value] = self.standard_map[cleaned_value] + if cleaned_value in self.standard_map or value in self.standard_map: + self.remap_dict[value] = ( + self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] + ) + continue + + if value.lower() in self.standard_map: + self.remap_dict[value] = self.standard_map[value.lower()] continue # Exact Match in Standard Values diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py index 1e45bd83..06e77bba 100644 --- a/asset_list/mappings/exising_pv.py +++ b/asset_list/mappings/exising_pv.py @@ -5,4 +5,8 @@ STANDARD_EXISTING_PV = { EXISTING_PV_MAPPINGS = { "NO": "no PV", "YES": "already has PV", + "no": "no PV", + "yes": "already has PV", + True: "already has PV", + False: "no PV", } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 4fce39ab..2fbdff70 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -34,13 +34,12 @@ HEATING_MAPPINGS = { "Eco Electric Radiators": "electric radiators", "Gas fire": "other", "Backboiler - Solid fuel": "other", + 'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler', + 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', + 'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other', } - -# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system', -# 'Condensing Boiler - GAS', 'Boiler Oil/other', -# 'Condensing Combi - Gas', 'Air Source Source Heat Pump', -# 'Biomass Boiler', 'Ground Source Heat Pump', -# 'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler', -# 'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS', -# 'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'], -# dtype=object) diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index bcad9ede..ec569123 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -11,6 +11,8 @@ PROPERTY_MAPPING = { "MAISONET": "maisonette", "BUNGALOW": "bungalow", "BLKHOUS": "block house", + "blkhous": "block house", "BEDSIT": "bedsit", "COACHSE": "coach house", + "coachse": "coach house", } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 7dec7d12..33db1fef 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,3 +1,5 @@ +from asset_list.AssetList import DataRemapper + STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", @@ -18,6 +20,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', 'Average thermal transmittance 0.18 W/m?K': 'unknown', 'Granite or whin, with internal insulation': 'granite or whinstone', + "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone", 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', @@ -34,5 +37,34 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', 'Cavity wall, with internal insulation': 'filled cavity', - 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown' + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown', + 'new build - average thermal transmittance': 'new build - average thermal transmittance', + 'average thermal transmittance 0.25 w/m?k': 'unknown', + 'cavity wall, as built, insulated (assumed)': 'filled cavity', + 'average thermal transmittance 0.31 w/m?k': 'unknown', + 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown', + 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown', + 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown', + 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m?k': 'unknown', + 'granite or whin, with internal insulation': 'granite or whinstone', + 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', + 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', + 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown', + 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', + 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', + 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown', + 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown', + 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown', + 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', + 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', + 'average thermal transmittance 0.28 w/m?k': 'unknown', } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index ca5195d6..1289fb09 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,7 +346,7 @@ def app(): invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] - self = AssetList( + asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME, @@ -364,8 +364,7 @@ def app(): landlord_heating_system="Heat Source", landlord_existing_pv="PV (Y/N)" ) - self.init_standardise( - ) + asset_list.init_standardise() self.apply_transformations() From 776285dd1592e037f9345a4396d83db671dedd03 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:35:21 +0000 Subject: [PATCH 42/72] added map printing --- asset_list/AssetList.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 8f905a33..87402924 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,8 +1,9 @@ import os import re +import tiktoken +from pprint import pprint from datetime import datetime from openai import OpenAI -import tiktoken import numpy as np import pandas as pd from fuzzywuzzy import process @@ -491,6 +492,12 @@ class AssetList: # We now print out the variable mappings, which can be reviewed by the user, before the final standardised # asset list is returned + for variable, mapping in self.variable_mappings.items(): + pprint(f"Variable: {variable}") + pprint(mapping) + # Print a space + print("\n") + pprint("=======================================") def apply_standardiation(self, override_empty_mappings=False): """ From 75e7c13a29ed98059a99e54245b72cebd9c52f48 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:51:48 +0000 Subject: [PATCH 43/72] modifying creation of ids --- asset_list/AssetList.py | 37 ++++++++++++++++++++++++++++---- etl/route_march_data_pull/app.py | 13 +++-------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 87402924..b153b624 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,3 +1,4 @@ +import hashlib import os import re import tiktoken @@ -324,11 +325,24 @@ class AssetList: We want all figures to be positive :return: """ - import sys + + # We'll remove punctuation and whitespace from the address, before hashing to produce an ID + + def _make_hash(value): + """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value.""" + # Normalize and remove special characters for cleaner ID + cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower() + + # Generate SHA-256 hash and truncate it + short_hash = hashlib.sha256(value.encode()).hexdigest()[:12] + + return f"{cleaned_value}-{short_hash}" + + # Apply transformation self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( - self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ - self.postcode_colname] - ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + self.standardised_asset_list[self.full_address_colname] + + self.standardised_asset_list[self.postcode_colname] + ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash) @staticmethod def _strip_postcode_from_full_address(full_address, postcode): @@ -509,5 +523,20 @@ class AssetList: if not self.variable_mappings and not override_empty_mappings: raise ValueError("Please run init_standardise first") + logger.info("Applying standardisation to asset list") + + for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + # Drop the dupes + pprint( + f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " + f"addresses - dropping" + ) + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ] + def create_lookup_mappings(self): pass diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1289fb09..54ae2280 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -344,7 +344,8 @@ def app(): HAS_NON_INTRUSIVES = True PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits - invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = {} asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), @@ -366,7 +367,7 @@ def app(): ) asset_list.init_standardise() - self.apply_transformations() + asset_list.apply_standardiation() # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -382,9 +383,6 @@ def app(): # # If we have the non-intrusives data, this should be true # HAS_NON_INTRUSIVES = True - # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) if MISSING_POSTCODES_METHOD is not None: @@ -464,11 +462,6 @@ def app(): # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] - if asset_list["deduper"].duplicated().sum(): - # Drop the dupes - print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") - asset_list = asset_list[~asset_list["deduper"].duplicated()] - asset_list = asset_list.drop(columns=["deduper"]) # We chunk up this data into 5000 rows at a time # Create the chunks directory From fe6de36782bc3d413f7813ee54ad151e11bc929d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 07:46:52 +0000 Subject: [PATCH 44/72] creating new maps --- etl/route_march_data_pull/app.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 54ae2280..d520895d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -6,6 +6,10 @@ import numpy as np from tqdm import tqdm from datetime import datetime from asset_list.AssetList import AssetList +from asset_list.mappings.property_type import PROPERTY_MAPPING +from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS +from asset_list.mappings.heating_systems import HEATING_MAPPINGS +from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -367,6 +371,21 @@ def app(): ) asset_list.init_standardise() + # We produce the new maps, which can be saved for future useage + + new_property_type_map = PROPERTY_MAPPING.copy().update( + asset_list.variable_mappings[asset_list.landlord_property_type] + ) + new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + ) + new_heating_map = HEATING_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_heating_system] + ) + new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + ) + asset_list.apply_standardiation() # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" From 63dbda005d63d590b1d2e1b156d15d125a67c746 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 07:57:47 +0000 Subject: [PATCH 45/72] completing full rename --- asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index b153b624..8379cc2a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -218,8 +218,9 @@ class AssetList: STANDARD_ADDRESS_1 = "domna_address_1" STANDARD_POSTCODE = "domna_postcode" STANDARD_FULL_ADDRESS = "domna_full_address" - STANDARD_YEAR_BUILT = "domna_year_built" + STANDARD_YEAR_BUILT = "landlord_year_built" STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" STANDARD_PROPERTY_TYPE = "landlord_property_type" STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" STANDARD_HEATING_SYSTEM = "landlord_heating_system" @@ -293,6 +294,8 @@ class AssetList: self.variable_mappings = {} + self.rename_map = {} + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -359,6 +362,25 @@ class AssetList: # We look for string in the form (x-y) return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) + @staticmethod + def _convert_uprn(x): + """ + Used to convert UPRNS to integer strings + :param x: uprn to convert + :return: converted uprn + """ + + if pd.isnull(x): + return x + + # check if numeric + if np.isreal(x): + return str(int(x)) + + if str(x).isdigit(): + return str(int(x)) + return x + def init_standardise(self): """ This function is used to standardise the asset list @@ -411,6 +433,12 @@ class AssetList: # We create the domna property id self.create_property_id() + # Clean up the UPRN column, if the landlord has provided them + if self.landlord_uprn is not None: + self.standardised_asset_list[self.landlord_uprn] = ( + self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn) + ) + # We keep just the columns we care about and will work through the various columns and standardise variables = [ self.landlord_property_id, @@ -425,7 +453,21 @@ class AssetList: self.landlord_heating_system, self.landlord_existing_pv ] - rename = {} + # Keep just non-null variables (e.g landlord may not provide uprn + variables = [v for v in variables if v is not None] + rename = { + self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, + self.address1_colname: self.STANDARD_ADDRESS_1, + self.postcode_colname: self.STANDARD_POSTCODE, + self.full_address_colname: self.STANDARD_FULL_ADDRESS, + self.landlord_uprn: self.STANDARD_UPRN, + self.landlord_property_type: self.STANDARD_PROPERTY_TYPE, + self.landlord_year_built: self.STANDARD_YEAR_BUILT, + self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, + self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, + self.landlord_existing_pv: self.STANDARD_EXISTING_PV + } + rename = {k: v for k, v in rename.items() if k is not None} if self.non_intrusives_present: variables += self.NON_INTRUSIVES_COLNAMES @@ -538,5 +580,10 @@ class AssetList: ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() ] + # Apply renames to our standard names + self.standardised_asset_list = self.standardised_asset_list.rename( + columns=self.rename_map + ) + def create_lookup_mappings(self): pass From 47ad0e8275ce218b0cd44de6342ff619d83a0d81 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:21:59 +0000 Subject: [PATCH 46/72] refactoring get_data methodology --- asset_list/AssetList.py | 23 +++-- etl/route_march_data_pull/app.py | 149 +++++++++---------------------- 2 files changed, 53 insertions(+), 119 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 8379cc2a..14dce093 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -295,6 +295,7 @@ class AssetList: self.variable_mappings = {} self.rename_map = {} + self.keep_variables = [] def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): @@ -454,8 +455,8 @@ class AssetList: self.landlord_existing_pv ] # Keep just non-null variables (e.g landlord may not provide uprn - variables = [v for v in variables if v is not None] - rename = { + self.keep_variables = [v for v in variables if v is not None] + self.rename_map = { self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, self.address1_colname: self.STANDARD_ADDRESS_1, self.postcode_colname: self.STANDARD_POSTCODE, @@ -467,21 +468,17 @@ class AssetList: self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, self.landlord_existing_pv: self.STANDARD_EXISTING_PV } - rename = {k: v for k, v in rename.items() if k is not None} + self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} if self.non_intrusives_present: - variables += self.NON_INTRUSIVES_COLNAMES - rename = { - **rename, + self.keep_variables += self.NON_INTRUSIVES_COLNAMES + self.rename_map = { + **self.rename_map, **dict( zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) ) } - self.standardised_asset_list = self.standardised_asset_list[variables].rename( - columns=rename - ) - # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ self.full_address_colname @@ -498,10 +495,9 @@ class AssetList: ) # Clear our build year column - # We attempt to process the year built column if self.landlord_year_built is not None: - # We check if we have a datetime + # We check if we have a datetime - year built has not been renamed if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): # We treat any string columns - with common values we see self.standardised_asset_list[self.landlord_year_built] = ( @@ -581,7 +577,8 @@ class AssetList: ] # Apply renames to our standard names - self.standardised_asset_list = self.standardised_asset_list.rename( + # Perform final variable selection and renaming: + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index d520895d..83e5e0ca 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,10 +1,10 @@ import os import time -from BaseUtility import Definitions +import json import pandas as pd import numpy as np from tqdm import tqdm -from datetime import datetime +from BaseUtility import Definitions from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -31,8 +31,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, - epc_api_only=False + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + uprn_column=None, epc_api_only=False, row_id_name="row_id" ): epc_data = [] errors = [] @@ -103,12 +103,12 @@ def get_data( searcher.find_property(skip_os=True) if searcher.newest_epc is None: - no_epc.append(home["row_id"]) + no_epc.append(home[row_id_name]) continue if epc_api_only: epc = { - "row_id": home["row_id"], + row_id_name: home[row_id_name], **searcher.newest_epc.copy() } @@ -144,7 +144,7 @@ def get_data( time.sleep(np.random.uniform(0.1, 1)) epc = { - "row_id": home["row_id"], + row_id_name: home[row_id_name], **searcher.newest_epc.copy(), "recommendations": property_recommendations["rows"], "find_my_epc_data": find_epc_data, @@ -152,7 +152,7 @@ def get_data( epc_data.append(epc) except Exception as e: - errors.append(home["row_id"]) + errors.append(home[row_id_name]) time.sleep(5) return epc_data, errors, no_epc @@ -402,113 +402,48 @@ def app(): # # If we have the non-intrusives data, this should be true # HAS_NON_INTRUSIVES = True - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - - if MISSING_POSTCODES_METHOD is not None: - if MISSING_POSTCODES_METHOD == "last_two_words": - # Replace any double spaces - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) - asset_list["Postcode"] = np.where( - pd.isnull(asset_list["Postcode"]), - asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "), - asset_list["Postcode"] - ) - else: - raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized") - - asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() - asset_list["row_id"] = asset_list.index - - # We clean up portential non-breaking spaces, and double spaces - for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: - asset_list[col] = asset_list[col].astype(str) - asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) - asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) - asset_list[col] = asset_list[col].str.strip() - - if ADDRESS1_COLUMN is None: - ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1( - asset_list=asset_list, - full_address_col=FULLADDRESS_COLUMN, - postcode_col=POSTCODE_COLUMN, - method=ADDRESS1_METHOD - ) - - if FULLADDRESS_COLUMN is None: - FULLADDRESS_COLUMN = "fulladdress_extracted" - # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas - # Sometimes, some of the columns are empty, so we need to remove them - asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply( - lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1 - ) - - # We clean up portential non-breaking spaces, and double spaces - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str) - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) - - if UPRN_COLUMN is not None: - # Check if it's numeric and if so, make sure it's an integer - def convert_uprn(x): - - if pd.isnull(x): - return x - - # check if numeric - if np.isreal(x): - return str(int(x)) - - if str(x).isdigit(): - return str(int(x)) - return x - - asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) - - # We attempt to process the year built column - if PROPERTY_YEAR_BUILT is not None: - # We check if we have a datetime - if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime): - # We treat any string columns - with common values we see - datetime_remap = { - "Pre 1900": datetime(year=1899, month=12, day=31), - } - asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap) - - asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT]) - # Convert this to year - asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year - - # We check for duplicated addresses - asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] + ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time # Create the chunks directory - if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")): - os.makedirs(os.path.join(DATA_FOLDER, "Chunks")) - chunk_size = 5000 - errors = [] - no_epc = [] + force_retrieve_data = False skip = None # Used to skip already completed chunks - for i in range(0, len(asset_list), chunk_size): + chunk_size = 5000 + filename = "Chunk {i}.csv" + download_folder = os.path.join(DATA_FOLDER, "Chunks") + if not os.path.exists(download_folder): + os.makedirs(download_folder) + + chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) + downloaded_files = {filename.format(i=i) for i in chunk_indexes} + + # We check if we have files associated to these files already and if we do, and we do not want to force the + # fetching of the data, we skip + folder_contents = os.listdir(download_folder) + if all(x in folder_contents for x in downloaded_files): + skip = max(chunk_indexes) + + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") - if skip is not None: + if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list[i:i + chunk_size] + chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( asset_list=chunk, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, + address1_column=asset_list.STANDARD_ADDRESS_1, + postcode_column=asset_list.STANDARD_POSTCODE, manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=UPRN_COLUMN + uprn_column=asset_list.STANDARD_UPRN ) # We now retrieve any failed properties - chunk_failed = chunk[chunk["row_id"].isin(errors)] + chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] epc_data_failed, _, _ = get_data( asset_list=chunk_failed, + row_id_name=asset_list.DOMNA_PROPERTY_ID, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, postcode_column=POSTCODE_COLUMN, @@ -517,20 +452,22 @@ def app(): ) epc_data_chunk.extend(epc_data_failed) - errors.extend(errors_chunk) - no_epc.extend(no_epc_chunk) # Append the failed data to the main data # Store the chunk locally as a csv pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + # Store the errors and no-data locally + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + json.dump(errors_chunk, f) + + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + json.dump(no_epc_chunk, f) # We read in and concatenate the created created chunks - chunks_folder = os.path.join(DATA_FOLDER, "Chunks") # List the contents - chunk_files = os.listdir(chunks_folder) epc_data = [] - for file in chunk_files: - csv_data = pd.read_csv(os.path.join(chunks_folder, file)) + for file in downloaded_files: + csv_data = pd.read_csv(os.path.join(download_folder, file)) # We need to convert the recommendations back to a list csv_data["recommendations"] = csv_data["recommendations"].apply(eval) csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) From 591ce5445839780ea64db5376eb0457d27da3d34 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:26:09 +0000 Subject: [PATCH 47/72] hndling case where landlord uprn and landlord property id are the sames --- asset_list/AssetList.py | 6 ++++++ etl/route_march_data_pull/app.py | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 14dce093..5e8ff29c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -297,6 +297,12 @@ class AssetList: self.rename_map = {} self.keep_variables = [] + # Finally, we handle the case where the landlord's property ID is actually the OS UPRN + if self.landlord_uprn == self.landlord_property_id: + self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy() + # Update the reference to landlord UPRn + self.landlord_uprn = self.STANDARD_UPRN + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 83e5e0ca..4bf9fe3a 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -511,6 +511,7 @@ def app(): find_my_epc_data["Solar photovoltaics"] = False # Retrieve just the data we need + epc_df = epc_df[ [ "row_id", @@ -527,21 +528,23 @@ def app(): "walls-description", "floor-description", "transaction-type", - # New fields needed "secondheat-description", "total-floor-area", "construction-age-band", "floor-height", "number-habitable-rooms", "mainheat-description", - # - "energy-consumption-current", # kwh/m2 + 'mainheatcont-description', + "energy-consumption-current", "photo-supply", ] ].rename( columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} ) + asset_list.merge_data(epc_df) + asset_list.insert_ + asset_list = asset_list.merge( epc_df, how="left", From 4a6802a5a24715ca0f047a70b680d6dc484cd7b4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:27:35 +0000 Subject: [PATCH 48/72] fixed bug to reference standardised data when copying uprn instead of raw --- asset_list/AssetList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 5e8ff29c..86b1bf87 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -299,7 +299,7 @@ class AssetList: # Finally, we handle the case where the landlord's property ID is actually the OS UPRN if self.landlord_uprn == self.landlord_property_id: - self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy() + self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN From 37cc43adb1b331d267c724faaf804afaa0b7f2fc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:39:29 +0000 Subject: [PATCH 49/72] refactoring creation of epc dataset --- asset_list/AssetList.py | 42 +++++++++++++++++ etl/route_march_data_pull/app.py | 77 +++++++------------------------- 2 files changed, 59 insertions(+), 60 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 86b1bf87..88425e6d 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -202,6 +202,33 @@ class AssetList: This class is used to standardise asset lists so that we can process the core information in a consistent manner. """ + EPC_API_DATA_NAMES = { + "uprn": "epc_os_uprn", + "address1": "epc_address1", + "address": "epc_address", + "postcode": "epc_postcode", + "inspection-date": "epc_inspection_date", + "current-energy-efficiency": "epc_sap_score_on_register", + "current-energy-rating": "epc_rating_on_register", + "property-type": "epc_property_type", + "built-form": "epc_archetype", + "total-floor-area": "epc_total_floor_area", + "construction-age-band": "epc_age_band", + "floor-height": "epc_floor_height", + "number-habitable-rooms": "epc_number_habitable_rooms", + "walls-description": "epc_wall_construction", + "roof-description": "epc_roof_construction", + "floor-description": "epc_floor_construction", + "mainheat-description": "epc_heating_type", + 'mainheatcont-description': "epc_heating_controls", + "secondheat-description": "epc_secondary_heating", + "transaction-type": "epc_reason", + "energy-consumption-current": "epc_heat_demand", + } + FIND_EPC_DATA_NAMES = { + + } + DATETIME_REMAP = { "Pre 1900": datetime(year=1899, month=12, day=31), } @@ -590,3 +617,18 @@ class AssetList: def create_lookup_mappings(self): pass + + def merge_data(self, df: pd.DataFrame): + """ + Used to insert data into the standardised asset list, based on the domna property id + :return: + """ + if self.DOMNA_PROPERTY_ID not in df.columns: + raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") + + if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + + self.standardised_asset_list = self.standardised_asset_list.merge( + df, how="left", on=self.DOMNA_PROPERTY_ID + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 4bf9fe3a..2e66c4aa 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -474,20 +474,22 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) + # TODO: TEMP!!! + epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) # We expand out the recommendations - recommendations_df = epc_df[["row_id", "recommendations"]] + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] unique_recommendations = set() for _, row in recommendations_df.iterrows(): unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - columns = ["row_id"] + list(unique_recommendations) + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) transformed_data = [] for _, row in recommendations_df.iterrows(): # Initialize a dictionary for this row with False for all recommendations row_data = {col: False for col in columns} - row_data["row_id"] = row["row_id"] + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] # Set True for each recommendation present in this row for rec in row["recommendations"]: @@ -500,10 +502,11 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation # recommendations - transformed_df = transformed_df[["row_id", "Cavity wall insulation"]] + transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]] # Get the find my epc data - find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( + columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) ) # We check if we get the solar pv column: @@ -513,46 +516,15 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ - [ - "row_id", - "uprn", - "address1", - "address", - "postcode", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "floor-description", - "transaction-type", - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - 'mainheatcont-description', - "energy-consumption-current", - "photo-supply", - ] - ].rename( - columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES ) - asset_list.merge_data(epc_df) - asset_list.insert_ - - asset_list = asset_list.merge( - epc_df, - how="left", - on="row_id" - ).merge( + epc_df = epc_df.merge( find_my_epc_data[ [ - "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name', "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", "Assessor’s ID", "Solar photovoltaics" ] @@ -564,31 +536,16 @@ def app(): } ), how="left", - on="row_id" + on=asset_list.DOMNA_PROPERTY_ID ) + asset_list.merge_data(epc_df) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns - asset_list = asset_list.rename(columns={ - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype - EPC", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "floor-description": "Floor Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)", - }) + asset_list = asset_list asset_list["Estimated Number of Floors"] = asset_list.apply( lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( From ecc9d9954073858685ef1877d574fc5fc73606b2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:45:15 +0000 Subject: [PATCH 50/72] major refactor of handling of epc data and starting to set up extract_attributes --- asset_list/AssetList.py | 23 ++++++++++++++++++----- etl/route_march_data_pull/app.py | 17 ++++------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 88425e6d..4ca4c2b8 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -226,7 +226,14 @@ class AssetList: "energy-consumption-current": "epc_heat_demand", } FIND_EPC_DATA_NAMES = { - + "heating_text": "epc_estiamted_heating_kwh", + "hot_water_text": "epc_estimated_hotwater_kwh", + 'Assessor’s name': "epc_assessor_name", + "Assessor's Telephone": "epc_assessor_telephone", + "Assessor's Email": "epc_assessor_email", + "Accreditation scheme": "epc_assessor_accreditation", + "Assessor’s ID": "epc_assessor_id", + "Solar photovoltaics": "epc_solar_pv" } DATETIME_REMAP = { @@ -265,7 +272,8 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] - #### Mapping for wall construction + # Attributes - these are columns that we produce, calcualted based on other pieces of data + ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" def __init__( self, @@ -615,9 +623,6 @@ class AssetList: columns=self.rename_map ) - def create_lookup_mappings(self): - pass - def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id @@ -632,3 +637,11 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list.merge( df, how="left", on=self.DOMNA_PROPERTY_ID ) + + def extract_attributes(self): + # Used to extracty the typical attributes that we use to identify viable work + + self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( + self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | + ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 2e66c4aa..8b112ea2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -522,25 +522,16 @@ def app(): ) epc_df = epc_df.merge( - find_my_epc_data[ - [ - asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name', - "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", - "Assessor’s ID", "Solar photovoltaics" - ] - ].rename( - columns={ - "Solar photovoltaics": "Has Solar PV", - "heating_text": "Heating Estimated kWh", - "hot_water_text": "Hot Water Estimated kWh", - } - ), + find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())] + .rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID ) asset_list.merge_data(epc_df) + asset_list.extract_attributes() + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) asset_list = asset_list.drop(columns=["photo-supply"]) From ed333e1714fa9ff3a4f09bc789e5aa37bca0bc8e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 09:04:26 +0000 Subject: [PATCH 51/72] refactored est no floors --- asset_list/AssetList.py | 27 +++++++++++++++++++++++++ etl/route_march_data_pull/app.py | 12 +++++------ recommendations/recommendation_utils.py | 7 +++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4ca4c2b8..74469c63 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -15,6 +15,12 @@ import asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) @@ -224,6 +230,7 @@ class AssetList: "secondheat-description": "epc_secondary_heating", "transaction-type": "epc_reason", "energy-consumption-current": "epc_heat_demand", + "photo-supply": "epc_photo_supply" } FIND_EPC_DATA_NAMES = { "heating_text": "epc_estiamted_heating_kwh", @@ -274,6 +281,7 @@ class AssetList: # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" + ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" def __init__( self, @@ -645,3 +653,22 @@ class AssetList: self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) ) + + accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] + + # The logic here is: + # 1) Take the property type provided by the HA themselves + # 2) In absence of that, take the EPC property type + # 3) Otherwise use None + self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( + lambda x: estimate_number_of_floors( + property_type=( + x[self.STANDARD_PROPERTY_TYPE].title() if + x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else ( + x[self.EPC_API_DATA_NAMES["property-type"]] if not + pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None + ) + ) + ), + axis=1 + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 8b112ea2..9754e726 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -514,7 +514,6 @@ def app(): find_my_epc_data["Solar photovoltaics"] = False # Retrieve just the data we need - epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) ].rename( @@ -529,15 +528,14 @@ def app(): ) asset_list.merge_data(epc_df) + # TODO: TEMP!!! + epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) + asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( + epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + ) asset_list.extract_attributes() - asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) - asset_list = asset_list.drop(columns=["photo-supply"]) - - # Rename the columns - asset_list = asset_list - asset_list["Estimated Number of Floors"] = asset_list.apply( lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( x["Property Type"]) else None, axis=1 diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 00da6107..602684cf 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -205,7 +205,7 @@ def get_wall_u_value( mapped_value = wall_uvalues_df[ wall_uvalues_df["Wall_type"] == mapped_description - ][age_band].values[0] + ][age_band].values[0] if pd.isnull(mapped_value) and "Park home" in mapped_description: # We don't know enough in this case so we default to 0 @@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type): Using the property type, we estimate the number of floors in the property """ + if property_type is None: + return None + if property_type == "House": number_of_floors = 2 elif property_type in ["Flat", "Bungalow"]: @@ -560,7 +563,7 @@ def get_floor_u_value( insulation_lookup = s11[ s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type - ] + ] if insulation_lookup.empty: insulation_thickness = 0 else: From 8bf6aa5af23378c0a1a27f6f756f3440d89b6bc4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 09:20:25 +0000 Subject: [PATCH 52/72] refactoring construction of the attributes --- asset_list/AssetList.py | 65 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 74469c63..5f4436b8 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -21,6 +21,8 @@ from recommendations.recommendation_utils import ( estimate_number_of_floors ) +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) @@ -279,9 +281,19 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] + # This SAP threshold is a key search criteria for properties that may be eligible for extraction + SAP_RATING_THRESHOLD = 75 + # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable + EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 + # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" + ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" + ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" + ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" + ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below" + ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" def __init__( self, @@ -672,3 +684,56 @@ class AssetList: ), axis=1 ) + + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float) + ) + # Replace "" value with None + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None) + ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float) + ) + + # Estimate the perimeter + self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + ), axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + floor_height=( + float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if + x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5 + ), + perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER], + built_form=x[self.EPC_API_DATA_NAMES["built-form"]] + ), + axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( + lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + "insulation_thickness"] if not pd.isnull( + x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + axis=1 + ) + + # We produce some additional fields + # 1) Is the SAP rating below C75 + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.SAP_RATING_THRESHOLD + ) + # 2) Flag anything where the EPC is older than 5 years + + self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = ( + pd.to_datetime( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]] + ).dt.year < self.EPC_YEAR_THRESHOLD + ) From c0ebffb6cbab5d4f4e2d24f82f352cb8b7024638 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 20:50:05 +0000 Subject: [PATCH 53/72] coding up logic to identify work types --- asset_list/AssetList.py | 250 ++++++++++++++++++++++++- asset_list/mappings/heating_systems.py | 1 + asset_list/mappings/walls.py | 27 ++- etl/route_march_data_pull/app.py | 164 +--------------- 4 files changed, 270 insertions(+), 172 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 5f4436b8..81aa525a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -10,6 +10,7 @@ import pandas as pd from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc +from BaseUtility import Definitions import asset_list.mappings.property_type as property_type_mappings import asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings @@ -282,7 +283,9 @@ class AssetList: ] # This SAP threshold is a key search criteria for properties that may be eligible for extraction - SAP_RATING_THRESHOLD = 75 + FILLED_CAVITY_SAP_THRESHOLD = 75 + # This SAP the + EMPTY_CAVITY_SAP_THRESHOLD = 71 # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 @@ -292,9 +295,17 @@ class AssetList: ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" - ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below" + ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" + # These are the descriptions that we look for in the EPC data that are indicative of no insulation + EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ + "cavity wall, as built, no insulation (assumed)", + "cavity wall, as built, partial insulation (assumed)", + "cavity wall, as built, partial insulation", + "cavity wall, as built, no insulation", + ] + def __init__( self, local_filepath, @@ -728,12 +739,241 @@ class AssetList: # 1) Is the SAP rating below C75 self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= - self.SAP_RATING_THRESHOLD + self.FILLED_CAVITY_SAP_THRESHOLD ) # 2) Flag anything where the EPC is older than 5 years - self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = ( pd.to_datetime( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]] + self.standardised_asset_list[self.EPC_API_DATA_NAMES["inspection-date"]] ).dt.year < self.EPC_YEAR_THRESHOLD ) + + self.process_age_band() + + def process_age_band(self): + processed_age_band = [] + for _, x in self.standardised_asset_list.iterrows(): + + if pd.isnull(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) or ( + x[self.EPC_API_DATA_NAMES["construction-age-band"]] in Definitions.DATA_ANOMALY_MATCHES + ): + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": None, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": "No EPC Age Band" + } + ) + continue + + # We exatract the upper and lower bounds + if x[self.EPC_API_DATA_NAMES["construction-age-band"]] in [ + "England and Wales: 2007 onwards", "England and Wales: 2012 onwards" + ]: + year_lower_bound = 2007 if x[self.EPC_API_DATA_NAMES[ + "construction-age-band"]] == "England and Wales: 2007 onwards" else 2012 + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] >= year_lower_bound + else "EPC Age Band is older than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": year_lower_bound, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + if x[self.EPC_API_DATA_NAMES["construction-age-band"]] == "England and Wales: before 1900": + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] < 1900 + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": None, + "epc_year_upper_bound": 1899, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + if x[self.EPC_API_DATA_NAMES["construction-age-band"]].isdigit(): + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] == int( + x[self.EPC_API_DATA_NAMES["construction-age-band"]] + ) + else "EPC Age Band is different from Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), + "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + # Oherwise, we extract the upper and lower bounds + age_band = x[self.EPC_API_DATA_NAMES["construction-age-band"]].split(": ")[1] + lower_date, upper_date = age_band.split("-") + + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and ( + x[self.STANDARD_YEAR_BUILT] <= float(upper_date) + ) + else "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": int(lower_date), + "epc_year_upper_bound": int(upper_date), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + + processed_age_band = pd.DataFrame(processed_age_band) + + self.standardised_asset_list = self.standardised_asset_list.merge( + processed_age_band, how="left" + ) + + def identify_worktypes(self): + + # If we have non-intrusives completed, we can use this to identify work types + + if self.non_intrusives_present: + ###################################################### + # Empty cavity: + ###################################################### + # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled + # 2) The age is before 1995 + # TODO: 3) Remove anything that likley has access issues + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) + ) + + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + + ###################################################### + # Extraction + ###################################################### + + # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged + # as needing a CIGA check. What is the logic we should be applying here? + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) + ) & ( + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + ) + + ###################################################### + # Solar + ###################################################### + # Criteria: + + # TODO: Standardise these columns with our cleaned_data object + + # Check 1: Does the property have a valid heating system? + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().str.contains("air source heat pump|ground source heat pump") + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters" + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES[ + "mainheatcont-description"]] == "Controls for high heat retention storage heaters" + ) + ) + ) + + # Check 2: Does the property have solar already + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + (self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") | + (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + + # Check 3: Does the property meet the fabric condition + # Solar PV installs are subject to the minimum insulation requirements which means: + # 1) one of the following insulation measures must be installed as part of the same + # ECO4 project: + # • roof insulation (flat roof, pitched roof, room-in-roof) + # • exterior facing wall insulation (cavity wall, solid wall) + # • party cavity wall insulation + # • floor insulation (solid and underfloor) + # + # OR + # + # all measures (except any exempted measure referred to in paragraph 4.28) + # listed in paragraph a) must already be installed + # + # With this in mind, we look for 2 clases + # 1) The property is fully insulated apart from the loft (<200mm insulation) + # 2) THe property is fully insulated + + self.standardised_asset_list["solar_landlord_walls_insulated"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( + ["filled cavity", "insulated solid brick"] + ) + ) + + EPC_INSULATED_WALLS_SUBSTRINGS = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + self.standardised_asset_list["landlord_wall_construction"].value_counts() + + EPC_INSULATED_ROOF_SUBSTRINGS = [ + "(another dwelling above)", "limited insulation", "(other premises above)", + ", no insulation", + ] diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 2fbdff70..89bfe0c4 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -13,6 +13,7 @@ STANDARD_HEATING_SYSTEMS = { "electric boiler", "unknown", "communal gas boiler", + "high heat retention storage heaters", } HEATING_MAPPINGS = { diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 33db1fef..c5cca599 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,8 +1,10 @@ from asset_list.AssetList import DataRemapper STANDARD_WALL_CONSTRUCTIONS = { - "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", - "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", + "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", + "timber frame", "uninsulated solid brick", + "insulated solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", + "cob", "new build - average thermal transmittance", } @@ -26,7 +28,8 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', - 'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', + 'Cavity wall,': "cavity unknown insulation", 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', @@ -55,7 +58,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', - 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation", 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', @@ -67,4 +70,20 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', 'average thermal transmittance 0.28 w/m?k': 'unknown', + 'Cavity wall, filled cavity': 'filled cavity', + 'Cavity wall, filled cavity and external insulation': 'filled cavity', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or ' + 'whinstone', + 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', + 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', + 'Solid brick, with external insulation': 'insulated solid brick', + 'Solid brick, with internal insulation': 'insulated solid brick', + 'System built, as built, insulated (assumed)': 'system built', + 'System built, as built, no insulation (assumed)': 'system built', + 'System built, with external insulation': 'system built', + 'System built, with internal insulation': 'system built', + 'Timber frame, as built, insulated (assumed)': 'timber frame', + 'Timber frame, as built, no insulation (assumed)': 'timber frame', + 'Timber frame, as built, partial insulation (assumed)': 'timber frame', + 'Timber frame, with additional insulation': 'timber frame', } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 9754e726..fbf7e10d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,7 +4,6 @@ import json import pandas as pd import numpy as np from tqdm import tqdm -from BaseUtility import Definitions from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -14,13 +13,6 @@ from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc -from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes - -from recommendations.recommendation_utils import ( - estimate_perimeter, - estimate_external_wall_area, - estimate_number_of_floors -) from etl.epc_clean.epc_attributes.attribute_utils import ( extract_thermal_transmittance @@ -177,109 +169,6 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t raise ValueError(f"Method {method} not recognized") -def process_age_band(asset_list, year_built_column): - processed_age_band = [] - for _, x in asset_list.iterrows(): - - if pd.isnull(x["Property Age Band"]) or ( - x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES - ): - processed_age_band.append({ - "row_id": x["row_id"], - "epc_year_lower_bound": None, - "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": "No EPC Age Band" - }) - continue - - # We exatract the upper and lower bounds - if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]: - year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012 - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound - else "EPC Age Band is older than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": year_lower_bound, - "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - if x["Property Age Band"] == "England and Wales: before 1900": - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] < 1900 - else "EPC Age Band is newer than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": None, - "epc_year_upper_bound": 1899, - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - if x["Property Age Band"].isdigit(): - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"]) - else "EPC Age Band is different from Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": int(x["Property Age Band"]), - "epc_year_upper_bound": int(x["Property Age Band"]), - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - # Oherwise, we extract the upper and lower bounds - age_band = x["Property Age Band"].split(": ")[1] - lower_date, upper_date = age_band.split("-") - - age_band_matches = ( - "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and ( - x[year_built_column] <= float(upper_date) - ) - else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date) - else "EPC Age Band is newer than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": int(lower_date), - "epc_year_upper_bound": int(upper_date), - "Does Age Match EPC Age Band?": age_band_matches - } - ) - - processed_age_band = pd.DataFrame(processed_age_band) - - return processed_age_band - - def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -531,62 +420,11 @@ def app(): # TODO: TEMP!!! epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" ) asset_list.extract_attributes() - asset_list["Estimated Number of Floors"] = asset_list.apply( - lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( - x["Property Type"]) else None, axis=1 - ) - - asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) - # Replace "" value with None - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) - - asset_list["Estimated Perimeter (m)"] = asset_list.apply( - lambda x: estimate_perimeter( - floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], - num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], - ), axis=1 - ) - - asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( - lambda x: estimate_external_wall_area( - num_floors=x["Estimated Number of Floors"], - floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, - perimeter=x["Estimated Perimeter (m)"], - built_form=x["Archetype - EPC"] - ), - axis=1 - ) - - asset_list["Roof Insulation Thickness"] = asset_list.apply( - lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( - x["Roof Construction"]) else None, - axis=1 - ) - - # We produce some additional fields - # 1) Is the SAP rating below C75 - asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75 - # 2) Flag anything where the EPC is older than 5 years - cutoff_year = pd.Timestamp.now().year - 5 - asset_list[f"EPC is pre {cutoff_year}"] = ( - pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year - ) - - # 3) If we have year in the asset list, we flag entries where the built year is different from the - # EPC Age band - if PROPERTY_YEAR_BUILT is not None: - # We process the age band and merge it on - processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT) - asset_list = asset_list.merge( - processed_age_band, how="left", on="row_id" - ) - if HAS_NON_INTRUSIVES: # Empty cavity: # 1) Has been flagged on the non-intrusives as being empty or partially filled From 4db9d48e366e121abcfe83e2dfd335d33151bc68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 12:39:06 +0000 Subject: [PATCH 54/72] adding the solar floor eligibiltiy criteria --- asset_list/AssetList.py | 85 ++++++++++++++++++++++++++++---- asset_list/requirements.txt | 3 +- etl/route_march_data_pull/app.py | 28 ++++++++++- 3 files changed, 105 insertions(+), 11 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 81aa525a..4666cf63 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -306,6 +306,17 @@ class AssetList: "cavity wall, as built, no insulation", ] + # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated + EPC_INSULATED_WALLS_SUBSTRINGS = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated + EPC_INSULATED_ROOF_SUBSTRINGS = [ + "(another dwelling above)", ", insulated", ", insulated (assumed) ", + ", ceiling insulated", + ] + def __init__( self, local_filepath, @@ -861,7 +872,10 @@ class AssetList: processed_age_band, how="left" ) - def identify_worktypes(self): + def identify_worktypes(self, cleaned): + + if not self.non_intrusives_present: + raise NotImplementedError("Need to implement the case for non-intrusives") # If we have non-intrusives completed, we can use this to identify work types @@ -892,6 +906,17 @@ class AssetList: ) ) + self.standardised_asset_list["empty_cavity"] = ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + # We add a reason + self.standardised_asset_list["empty_cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + "Non-Intrusive Data", + "EPC Data" + ) + ###################################################### # Extraction ###################################################### @@ -967,13 +992,55 @@ class AssetList: ) ) - EPC_INSULATED_WALLS_SUBSTRINGS = [ - ", insulated", "with external insulation", "with internal insulation", "filled cavity" - ] + # TODO: We don't have information about the roof from this landlord + self.standardised_asset_list["solar_epc_walls_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) - self.standardised_asset_list["landlord_wall_construction"].value_counts() + # We merge on the u-value for average thermal transmittance + roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) + roof_uvalue_data = roof_uvalue_data[ + ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["roof-description"], + "thermal_transmittance": "roof_u_value" + } + ) - EPC_INSULATED_ROOF_SUBSTRINGS = [ - "(another dwelling above)", "limited insulation", "(other premises above)", - ", no insulation", - ] + self.standardised_asset_list = self.standardised_asset_list.merge( + roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + ) + + # If the u-value of a roof is less than 0.7 we consider it insulated + self.standardised_asset_list["solar_epc_roof_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False + ) | ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) >= 270 if str(x).isdigit() else False + ) + ) | ( + self.standardised_asset_list["roof_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 270 if str(x).isdigit() else False + ) + + self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("solid") + self.standardised_asset_list["solar_epc_floor_is_solid"] = ( + self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False) + ) + + z = self.standardised_asset_list[ + self.standardised_asset_list["solar_epc_floor_is_solid"] == True + ] diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index 0c16c43a..fd045d46 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -7,4 +7,5 @@ fuzzywuzzy boto3 openpyxl openai -tiktoken \ No newline at end of file +tiktoken +msgpack \ No newline at end of file diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index fbf7e10d..32c36fe8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -391,13 +391,28 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation # recommendations - transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]] + transformed_df = transformed_df[ + [ + asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation", "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ] + ] + + transformed_df["epc_has_floor_recommendation"] = ( + transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | + transformed_df["Floor insulation (suspended floor)"] + ) # Get the find my epc data find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) ) + find_my_epc_data = find_my_epc_data.merge( + transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], + how="left", on=asset_list.DOMNA_PROPERTY_ID + ) + # We check if we get the solar pv column: if "Solar photovoltaics" not in find_my_epc_data.columns: find_my_epc_data["Solar photovoltaics"] = False @@ -425,6 +440,17 @@ def app(): asset_list.extract_attributes() + # TODO - Use this! + import msgpack + from utils.s3 import read_from_s3 + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + asset_list.identify_worktypes(cleaned) + if HAS_NON_INTRUSIVES: # Empty cavity: # 1) Has been flagged on the non-intrusives as being empty or partially filled From c544c95282df3a9c50fc84ab46bd387f889a4b4d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 14:35:12 +0000 Subject: [PATCH 55/72] working on solar criteria --- asset_list/AssetList.py | 105 +++++++++++++++++++++++++++---- etl/route_march_data_pull/app.py | 8 +-- 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4666cf63..056f8b5d 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -936,9 +936,6 @@ class AssetList: # Solar ###################################################### # Criteria: - - # TODO: Standardise these columns with our cleaned_data object - # Check 1: Does the property have a valid heating system? self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( @@ -993,9 +990,35 @@ class AssetList: ) # TODO: We don't have information about the roof from this landlord + + # We merge on the u-value for average thermal transmittance + walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) + walls_uvalue_data = walls_uvalue_data[ + ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["walls-description"], + "thermal_transmittance": "walls_u_value" + } + ) + self.standardised_asset_list = self.standardised_asset_list.merge( + walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + ) + self.standardised_asset_list["solar_epc_walls_insulated"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( - "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES[ + "walls-description"]].str.lower().str.contains( + "|".join( + self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) | ( + self.standardised_asset_list[ + "walls_u_value"].apply( + lambda x: x <= 0.3 if not pd.isnull( + x) else False + ) ) ) @@ -1034,13 +1057,69 @@ class AssetList: lambda x: int(x) < 270 if str(x).isdigit() else False ) - self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains("solid") - self.standardised_asset_list["solar_epc_floor_is_solid"] = ( - self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False) + # TODO: Fill with False - should be temp! + self.standardised_asset_list["epc_has_floor_recommendation"] = ( + self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) - z = self.standardised_asset_list[ - self.standardised_asset_list["solar_epc_floor_is_solid"] == True - ] + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("solid") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) + ) | ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid") + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() + .str.contains(", insulated") + ) + ) + ) + + # We now put together the criteria: + # Flag properties that look eligible for solar, that have solid floors + # TODO: We'll need to revise this + self.standardised_asset_list["solar_eligible_solid_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Solid floor but needs a loft top-up + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Suspended floor, fully insulated + + # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] & diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 32c36fe8..0de85a27 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -389,11 +389,9 @@ def app(): transformed_data.append(row_data) transformed_df = pd.DataFrame(transformed_data) - # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation - # recommendations transformed_df = transformed_df[ [ - asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation", "Floor insulation (solid floor)", + asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", "Floor insulation", "Floor insulation (suspended floor)" ] ] @@ -425,7 +423,9 @@ def app(): ) epc_df = epc_df.merge( - find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())] + find_my_epc_data[ + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ] .rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID From 84ae26a9133e91a3f1904db2407f2f84bfb7305a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:14:18 +0000 Subject: [PATCH 56/72] added the eligibility criteria for solar and aggregate figures: --- asset_list/AssetList.py | 117 ++++++++++++++++++++++++++++++- etl/route_march_data_pull/app.py | 1 - 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 056f8b5d..ffe53d40 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -341,6 +341,8 @@ class AssetList: # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) self.standardised_asset_list = self.raw_asset_list.copy() + # Will be used to store aggregated figures against the various work types + self.work_type_figures = {} # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -1062,6 +1064,23 @@ class AssetList: self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) + # We merge on the u-value for average thermal transmittance + floors_uvalue_data = pd.DataFrame(cleaned["floor-description"]) + floors_uvalue_data = floors_uvalue_data[ + ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["floor-description"], + "thermal_transmittance": "floor_u_value" + } + ) + + # Merge on + self.standardised_asset_list = self.standardised_asset_list.merge( + floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] + ) + + # We assume that a U-value of 0.5 or below is indicative of an insulated floor self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( ( ( @@ -1072,7 +1091,8 @@ class AssetList: ) ) | ( ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid") + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") ) & ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() .str.contains(", insulated") @@ -1080,6 +1100,33 @@ class AssetList: ) ) + # Check for other floor types, insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( + # The floor is suspended and insulated + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("suspended") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) + ) | ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("suspended") + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains(", insulated") + ) + ) | ( + self.standardised_asset_list["floor_u_value"].apply( + lambda x: x <= 0.5 if not pd.isnull(x) else False + ) + ) + ) + # We now put together the criteria: # Flag properties that look eligible for solar, that have solid floors # TODO: We'll need to revise this @@ -1120,6 +1167,70 @@ class AssetList: self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] ) - # Suspended floor, fully insulated + # Other floor type, fully insulated + self.standardised_asset_list["solar_eligible_other_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Other floor type, needs loft top-up + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor is not solid, but is insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) + + # Produce some aggregate figures + self.work_type_figures = { + # Empty cavity from non-intrusives + "Empty Cavity (non-intrusives)": ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() + ), + "Empty Cavity (EPC)": ( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Cavity Extraction": ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum() + ), + "Solar PV (Solid Floor)": ( + self.standardised_asset_list["solar_eligible_solid_floor"].sum() + ), + "Solar PV (Solid Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"].sum() + ), + "Solar PV (Other Floor)": ( + self.standardised_asset_list["solar_eligible_other_floor"].sum() + ), + "Solar PV (Other Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() + ) + } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 0de85a27..5960f69b 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -440,7 +440,6 @@ def app(): asset_list.extract_attributes() - # TODO - Use this! import msgpack from utils.s3 import read_from_s3 cleaned = read_from_s3( From 5df47a86ae889b4e26191550f79fc4720f2878a7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:15:58 +0000 Subject: [PATCH 57/72] removed cirular import --- asset_list/mappings/walls.py | 2 - etl/route_march_data_pull/app.py | 126 +------------------------------ 2 files changed, 3 insertions(+), 125 deletions(-) diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index c5cca599..1fc52fcb 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,5 +1,3 @@ -from asset_list.AssetList import DataRemapper - STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", "timber frame", "uninsulated solid brick", diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 5960f69b..7bf3cca8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,6 +4,8 @@ import json import pandas as pd import numpy as np from tqdm import tqdm +import msgpack +from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -440,8 +442,6 @@ def app(): asset_list.extract_attributes() - import msgpack - from utils.s3 import read_from_s3 cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name="retrofit-data-dev" @@ -450,114 +450,7 @@ def app(): asset_list.identify_worktypes(cleaned) - if HAS_NON_INTRUSIVES: - # Empty cavity: - # 1) Has been flagged on the non-intrusives as being empty or partially filled - # 2) The age is before 1995 - # 3) Remove anything that likley has access issues - asset_list["Suitable for Cavity Fill"] = ( - (asset_list["Construction"] == "CAVITY") & - asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & - ( - # Shold we defer to the year built provided by the HA? - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995) - ) & - ( - # We check if the property type column contains one of the invalid property types - ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary)) - ) - ) - - # asset_list["Suitable for Extraction"] = - asset_list[ - (asset_list["Construction"] == "Cavity") & - asset_list["Insulated"].isin(["RETRO DRILLED"]) & - ( - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) - ) & - ( - asset_list[PROPERTY_TYPE_COLUMN] - ) - ] - - # 4) Flag properties that look like they're good candidates for solar installs - # Firstly, flag if the fabric is completely done - - insulated_wall_substrings = [ - ", insulated", "with external insulation", "with internal insulation", "filled cavity" - ] - - insulated_roof_substrings = [ - "(another dwelling above)", "limited insulation", "(other premises above)", - ", no insulation", - ] - - def check_solar_insulation_conditions(x): - - if pd.isnull(x["Wall Construction"]): - return None - - if "average thermal transmittance" in x["Wall Construction"].lower(): - # We extract out the u-values - wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"] - roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"] - floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"] - - roof_uvalue = 0 if roof_uvalue is None else roof_uvalue - floor_uvalue = 0 if floor_uvalue is None else floor_uvalue - - # We apply some cutoffs - if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7: - return "Walls, Roof and Floor have U-values below 0.7" - - return "Confirm U-values" - - walls_insulated = any( - insulated_substring in x["Wall Construction"].lower() for insulated_substring in insulated_wall_substrings - ) - roof_is_numeric = False - if str(x["Roof Insulation Thickness"]).isdigit(): - roof_is_numeric = True - roof_insulated = int(x["Roof Insulation Thickness"]) >= 200 - else: - roof_insulated = any( - insulated_substring in x["Roof Construction"].lower() for insulated_substring in - insulated_roof_substrings - ) - - floor_is_solid = "solid" in x["Floor Construction"].lower() - - if walls_insulated and roof_insulated and floor_is_solid: - return "Walls Insulated, Roof Insulated, Floor Solid" - - if walls_insulated and floor_is_solid and roof_is_numeric: - return "Walls Insulated, Floor Solid, Loft need top-up" - - return "Not Fully Insulated or no data" - - asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1) - - asset_list["Good Solar Candidate"] = ( - asset_list["SAP Rating is 75 and below"] & - ~asset_list["Has Solar PV"] & - ( - asset_list["Heating Type"].isin( - [ - "Electric storage heaters", - "Room heaters, electric", - ] - ) | asset_list["Heating Type"].str.contains("heat pump", case=False) - ) & ( - asset_list["Solar Fabric Condition"].isin( - [ - "Walls Insulated, Roof Insulated, Floor Solid", - "Walls, Roof and Floor have U-values below 0.7", - "Walls Insulated, Floor Solid, Loft need top-up" - ] - ) - ) - ) - + # TODO: We should do this breakdown for flats def flat_analysis(asset_list): # We need to deduce the building name - we strip out the house number @@ -596,19 +489,6 @@ def app(): flat_data = flat_analysis(asset_list) - # For all of the columns in transformed_df, prefix with "Recommendation: " - for col in transformed_df.columns: - if col == "row_id": - continue - transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) - - asset_list = asset_list.merge( - transformed_df, - how="left", - on="row_id" - ) - asset_list = asset_list.drop(columns=["row_id", "index"]) - # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data From d86ab5ff8df50e58248bff92582084462fc2166b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:18:53 +0000 Subject: [PATCH 58/72] restructuing app location --- asset_list/app.py | 497 ++++++++++++++++++++ etl/route_march_data_pull/app.py | 502 --------------------- etl/route_march_data_pull/requirements.txt | 0 3 files changed, 497 insertions(+), 502 deletions(-) delete mode 100644 etl/route_march_data_pull/app.py delete mode 100644 etl/route_march_data_pull/requirements.txt diff --git a/asset_list/app.py b/asset_list/app.py index 21b405d8..1a7788fe 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -1 +1,498 @@ import os +import time +import json +import pandas as pd +import numpy as np +from tqdm import tqdm +import msgpack +from utils.s3 import read_from_s3 +from asset_list.AssetList import AssetList +from asset_list.mappings.property_type import PROPERTY_MAPPING +from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS +from asset_list.mappings.heating_systems import HEATING_MAPPINGS +from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS + +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data( + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + uprn_column=None, epc_api_only=False, row_id_name="row_id" +): + epc_data = [] + errors = [] + no_epc = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home[postcode_column] + house_number = str(home[address1_column]).strip() + full_address = home[fulladdress_column].strip() + house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number + uprn = manual_uprn_map.get(full_address, None) + if uprn is None and home.get(uprn_column): + uprn = home[uprn_column] + + if pd.isnull(uprn): + uprn = None + + searcher = SearchEpc( + address1=str(house_no), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5, + uprn=uprn + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + + # Check if we have a flat or appartment + if searcher.newest_epc is None and uprn is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",") + if len(add1) > 1: + add1 = add1[1].strip() + else: + # Try splitting on space + add1 = full_address.split(" ")[0].strip() + + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + no_epc.append(home[row_id_name]) + continue + + if epc_api_only: + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e) and "address1" in searcher.newest_epc: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_data = {} + else: + find_epc_data = {} + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home[row_id_name]) + time.sleep(5) + + return epc_data, errors, no_epc + + +def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + raise ValueError(f"Method {method} not recognized") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + """ + + # TODO: + # For cavity work: + # - Flag any entries that have a different wall type between non-intrusive data against EPC + # - Worth double checking entries that have a difference in wall construction + # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity + # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation + # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats + # are less than C75 + # - Flag anything pre SAP2012 + # - Flag anything over 5 years old + # - Look at year built vs age band + # + # For Solar: + # - Discount any that have solar PV - based on non-intrusives and from the inspections team + # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with + # electric room heaters but it might need to be an EPC E + # - Fabric - check the floor, wall and roof: + # - Filled or empty cavity is good + # - Insulated solid/timber/system built is good + # - SCIS/CEG needs solid floors + # - JJC don’t care + # - Anything with a loft 200 or below + # - Anything C75 and above won’t qualify + # - Insulated loft = 200mm + # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) + # - Or the insulation required is loft/cavity (floors should be solid) + + # For Westward + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + DATA_FILENAME = "WESTWARD - completed list..xlsx" + SHEET_NAME = "Sheet1" + + POSTCODE_COLUMN = "WFT EDIT Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "house_number_extraction" + + ADDRESS_COLS_TO_CONCAT = [] + MISSING_POSTCODES_METHOD = None + PROPERTY_YEAR_BUILT = "Build date" + UPRN_COLUMN = "UPRN" + # If we have the non-intrusives data, this should be true + HAS_NON_INTRUSIVES = True + PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = {} + + asset_list = AssetList( + local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + header=0, + sheet_name=SHEET_NAME, + address1_colname=ADDRESS1_COLUMN, + postcode_colname=POSTCODE_COLUMN, + landlord_property_id="UPRN", + full_address_colname=FULLADDRESS_COLUMN, + full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, + missing_postcodes_method=MISSING_POSTCODES_METHOD, + address1_extraction_method=ADDRESS1_METHOD, + landlord_year_built=PROPERTY_YEAR_BUILT, + landlord_uprn=UPRN_COLUMN, + landlord_property_type=PROPERTY_TYPE_COLUMN, + landlord_wall_construction="Wall Construction (EPC)", + landlord_heating_system="Heat Source", + landlord_existing_pv="PV (Y/N)" + ) + asset_list.init_standardise() + + # We produce the new maps, which can be saved for future useage + + new_property_type_map = PROPERTY_MAPPING.copy().update( + asset_list.variable_mappings[asset_list.landlord_property_type] + ) + new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + ) + new_heating_map = HEATING_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_heating_system] + ) + new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + ) + + asset_list.apply_standardiation() + + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = 'Full Address.1' + # FULLADDRESS_COLUMN = "Full Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "first_word" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = "Build Date" + # UPRN_COLUMN = None + # # If we have the non-intrusives data, this should be true + # HAS_NON_INTRUSIVES = True + + ### We retrieve the EPC data + + # We chunk up this data into 5000 rows at a time + # Create the chunks directory + force_retrieve_data = False + skip = None # Used to skip already completed chunks + chunk_size = 5000 + filename = "Chunk {i}.csv" + download_folder = os.path.join(DATA_FOLDER, "Chunks") + if not os.path.exists(download_folder): + os.makedirs(download_folder) + + chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) + downloaded_files = {filename.format(i=i) for i in chunk_indexes} + + # We check if we have files associated to these files already and if we do, and we do not want to force the + # fetching of the data, we skip + folder_contents = os.listdir(download_folder) + if all(x in folder_contents for x in downloaded_files): + skip = max(chunk_indexes) + + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): + print(f"Processing chunk {i} to {i + chunk_size}") + if skip is not None and not force_retrieve_data: + if i <= skip: + continue + chunk = asset_list.standardised_asset_list[i:i + chunk_size] + epc_data_chunk, errors_chunk, no_epc_chunk = get_data( + asset_list=chunk, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, + address1_column=asset_list.STANDARD_ADDRESS_1, + postcode_column=asset_list.STANDARD_POSTCODE, + manual_uprn_map=MANUAL_UPRN_MAP, + uprn_column=asset_list.STANDARD_UPRN + ) + + # We now retrieve any failed properties + chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] + epc_data_failed, _, _ = get_data( + asset_list=chunk_failed, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP, + epc_api_only=False + ) + + epc_data_chunk.extend(epc_data_failed) + + # Append the failed data to the main data + # Store the chunk locally as a csv + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + # Store the errors and no-data locally + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + json.dump(errors_chunk, f) + + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + json.dump(no_epc_chunk, f) + + # We read in and concatenate the created created chunks + # List the contents + epc_data = [] + for file in downloaded_files: + csv_data = pd.read_csv(os.path.join(download_folder, file)) + # We need to convert the recommendations back to a list + csv_data["recommendations"] = csv_data["recommendations"].apply(eval) + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + epc_data.append(csv_data) + + epc_df = pd.concat(epc_data) + # TODO: TEMP!!! + epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) + + # We expand out the recommendations + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + transformed_df = transformed_df[ + [ + asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ] + ] + + transformed_df["epc_has_floor_recommendation"] = ( + transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | + transformed_df["Floor insulation (suspended floor)"] + ) + + # Get the find my epc data + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( + columns=["find_my_epc_data"]).join( + pd.json_normalize(epc_df["find_my_epc_data"]) + ) + find_my_epc_data = find_my_epc_data.merge( + transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], + how="left", on=asset_list.DOMNA_PROPERTY_ID + ) + + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + + # Retrieve just the data we need + epc_df = epc_df[ + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES + ) + + epc_df = epc_df.merge( + find_my_epc_data[ + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ] + .rename(columns=asset_list.FIND_EPC_DATA_NAMES), + how="left", + on=asset_list.DOMNA_PROPERTY_ID + ) + + asset_list.merge_data(epc_df) + # TODO: TEMP!!! + epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) + asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( + epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + ) + + asset_list.extract_attributes() + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + asset_list.identify_worktypes(cleaned) + + # TODO: We should do this breakdown for flats + def flat_analysis(asset_list): + + # We need to deduce the building name - we strip out the house number + def extract_building_name(x): + # TODO: This doesn't really work + if pd.isnull(x): + return None + house_no = SearchEpc.get_house_number(address=x, postcode=None) + if house_no: + return x.replace(house_no, "").strip() + return x.split(",")[0].strip() + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) + + flat_data = [] + for _, group in grouped: + if "flat" in group["Property Type"].str.lower().values: + num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) + num_below_c75 = group["SAP score on register"].lt(75).sum() + + flat_data.append( + { + "Postcode": group[POSTCODE_COLUMN].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + } + ) + + flat_data = pd.DataFrame(flat_data) + + return flat_data + + flat_data = flat_analysis(asset_list) + + # Store as an excel + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data + + with pd.ExcelWriter(filename) as writer: + asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + flat_data.to_excel(writer, sheet_name="Flat Data", index=False) + + matches_review = asset_list[ + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] + ] diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py deleted file mode 100644 index 7bf3cca8..00000000 --- a/etl/route_march_data_pull/app.py +++ /dev/null @@ -1,502 +0,0 @@ -import os -import time -import json -import pandas as pd -import numpy as np -from tqdm import tqdm -import msgpack -from utils.s3 import read_from_s3 -from asset_list.AssetList import AssetList -from asset_list.mappings.property_type import PROPERTY_MAPPING -from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS -from asset_list.mappings.heating_systems import HEATING_MAPPINGS -from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS - -from dotenv import load_dotenv -from backend.SearchEpc import SearchEpc -from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc - -from etl.epc_clean.epc_attributes.attribute_utils import ( - extract_thermal_transmittance -) - -load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") - - -def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, - uprn_column=None, epc_api_only=False, row_id_name="row_id" -): - epc_data = [] - errors = [] - no_epc = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - try: - postcode = home[postcode_column] - house_number = str(home[address1_column]).strip() - full_address = home[fulladdress_column].strip() - house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) - if house_no is None: - house_no = house_number - uprn = manual_uprn_map.get(full_address, None) - if uprn is None and home.get(uprn_column): - uprn = home[uprn_column] - - if pd.isnull(uprn): - uprn = None - - searcher = SearchEpc( - address1=str(house_no), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5, - uprn=uprn - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - - # Check if we have a flat or appartment - if searcher.newest_epc is None and uprn is None: - # Try again: - if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: - # Backup - add1 = full_address.split(",") - if len(add1) > 1: - add1 = add1[1].strip() - else: - # Try splitting on space - add1 = full_address.split(" ")[0].strip() - - else: - add1 = str(house_number) - searcher = SearchEpc( - address1=add1, - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - - if ( - "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in - house_number.lower() - ): - searcher.ordnance_survey_client.property_type = "Flat" - - searcher.find_property(skip_os=True) - - if searcher.newest_epc is None: - no_epc.append(home[row_id_name]) - continue - - if epc_api_only: - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy() - } - - epc_data.append(epc) - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e) and "address1" in searcher.newest_epc: - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e): - find_epc_data = {} - else: - find_epc_data = {} - except Exception as e: - raise Exception(f"Error retrieving FindMyEPC data: {e}") - time.sleep(np.random.uniform(0.1, 1)) - - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"], - "find_my_epc_data": find_epc_data, - } - - epc_data.append(epc) - except Exception as e: - errors.append(home[row_id_name]) - time.sleep(5) - - return epc_data, errors, no_epc - - -def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): - if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") - return asset_list - - if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] - return asset_list - - if method == "house_number_extraction": - asset_list["address1_extracted"] = asset_list.apply( - lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), - axis=1 - ) - return asset_list - - raise ValueError(f"Method {method} not recognized") - - -def app(): - """ - This app is EPC pulling data for some properties owned by Livewest - - Data request contents: - Date of last EPC - Reason for EPC - SAP score on register - Property Type - Property Area - Property Age - Any Dimensions (HLP,PW,RH) - Property Wall Construction - Heating Type - Secondary Heating - Loft Insulation Depth - - Additional if possible: - Heat loss calculations - EPC recommendations - Property UPRN - """ - - # TODO: - # For cavity work: - # - Flag any entries that have a different wall type between non-intrusive data against EPC - # - Worth double checking entries that have a difference in wall construction - # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity - # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation - # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats - # are less than C75 - # - Flag anything pre SAP2012 - # - Flag anything over 5 years old - # - Look at year built vs age band - # - # For Solar: - # - Discount any that have solar PV - based on non-intrusives and from the inspections team - # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with - # electric room heaters but it might need to be an EPC E - # - Fabric - check the floor, wall and roof: - # - Filled or empty cavity is good - # - Insulated solid/timber/system built is good - # - SCIS/CEG needs solid floors - # - JJC don’t care - # - Anything with a loft 200 or below - # - Anything C75 and above won’t qualify - # - Insulated loft = 200mm - # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) - # - Or the insulation required is loft/cavity (floors should be solid) - - # For Westward - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - DATA_FILENAME = "WESTWARD - completed list..xlsx" - SHEET_NAME = "Sheet1" - - POSTCODE_COLUMN = "WFT EDIT Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "house_number_extraction" - - ADDRESS_COLS_TO_CONCAT = [] - MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build date" - UPRN_COLUMN = "UPRN" - # If we have the non-intrusives data, this should be true - HAS_NON_INTRUSIVES = True - PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits - - # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} - - asset_list = AssetList( - local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), - header=0, - sheet_name=SHEET_NAME, - address1_colname=ADDRESS1_COLUMN, - postcode_colname=POSTCODE_COLUMN, - landlord_property_id="UPRN", - full_address_colname=FULLADDRESS_COLUMN, - full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, - missing_postcodes_method=MISSING_POSTCODES_METHOD, - address1_extraction_method=ADDRESS1_METHOD, - landlord_year_built=PROPERTY_YEAR_BUILT, - landlord_uprn=UPRN_COLUMN, - landlord_property_type=PROPERTY_TYPE_COLUMN, - landlord_wall_construction="Wall Construction (EPC)", - landlord_heating_system="Heat Source", - landlord_existing_pv="PV (Y/N)" - ) - asset_list.init_standardise() - - # We produce the new maps, which can be saved for future useage - - new_property_type_map = PROPERTY_MAPPING.copy().update( - asset_list.variable_mappings[asset_list.landlord_property_type] - ) - new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_wall_construction] - ) - new_heating_map = HEATING_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_heating_system] - ) - new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_existing_pv] - ) - - asset_list.apply_standardiation() - - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = 'Full Address.1' - # FULLADDRESS_COLUMN = "Full Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "first_word" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build Date" - # UPRN_COLUMN = None - # # If we have the non-intrusives data, this should be true - # HAS_NON_INTRUSIVES = True - - ### We retrieve the EPC data - - # We chunk up this data into 5000 rows at a time - # Create the chunks directory - force_retrieve_data = False - skip = None # Used to skip already completed chunks - chunk_size = 5000 - filename = "Chunk {i}.csv" - download_folder = os.path.join(DATA_FOLDER, "Chunks") - if not os.path.exists(download_folder): - os.makedirs(download_folder) - - chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) - downloaded_files = {filename.format(i=i) for i in chunk_indexes} - - # We check if we have files associated to these files already and if we do, and we do not want to force the - # fetching of the data, we skip - folder_contents = os.listdir(download_folder) - if all(x in folder_contents for x in downloaded_files): - skip = max(chunk_indexes) - - for i in range(0, len(asset_list.standardised_asset_list), chunk_size): - print(f"Processing chunk {i} to {i + chunk_size}") - if skip is not None and not force_retrieve_data: - if i <= skip: - continue - chunk = asset_list.standardised_asset_list[i:i + chunk_size] - epc_data_chunk, errors_chunk, no_epc_chunk = get_data( - asset_list=chunk, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, - address1_column=asset_list.STANDARD_ADDRESS_1, - postcode_column=asset_list.STANDARD_POSTCODE, - manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=asset_list.STANDARD_UPRN - ) - - # We now retrieve any failed properties - chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] - epc_data_failed, _, _ = get_data( - asset_list=chunk_failed, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP, - epc_api_only=False - ) - - epc_data_chunk.extend(epc_data_failed) - - # Append the failed data to the main data - # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) - # Store the errors and no-data locally - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: - json.dump(errors_chunk, f) - - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: - json.dump(no_epc_chunk, f) - - # We read in and concatenate the created created chunks - # List the contents - epc_data = [] - for file in downloaded_files: - csv_data = pd.read_csv(os.path.join(download_folder, file)) - # We need to convert the recommendations back to a list - csv_data["recommendations"] = csv_data["recommendations"].apply(eval) - csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) - epc_data.append(csv_data) - - epc_df = pd.concat(epc_data) - # TODO: TEMP!!! - epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) - - # We expand out the recommendations - recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] - - unique_recommendations = set() - for _, row in recommendations_df.iterrows(): - unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - - columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) - transformed_data = [] - for _, row in recommendations_df.iterrows(): - # Initialize a dictionary for this row with False for all recommendations - row_data = {col: False for col in columns} - row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] - - # Set True for each recommendation present in this row - for rec in row["recommendations"]: - recommendation_text = rec["improvement-summary-text"] - row_data[recommendation_text] = True - - # Append the row data to transformed_data - transformed_data.append(row_data) - - transformed_df = pd.DataFrame(transformed_data) - transformed_df = transformed_df[ - [ - asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" - ] - ] - - transformed_df["epc_has_floor_recommendation"] = ( - transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | - transformed_df["Floor insulation (suspended floor)"] - ) - - # Get the find my epc data - find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( - columns=["find_my_epc_data"]).join( - pd.json_normalize(epc_df["find_my_epc_data"]) - ) - find_my_epc_data = find_my_epc_data.merge( - transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], - how="left", on=asset_list.DOMNA_PROPERTY_ID - ) - - # We check if we get the solar pv column: - if "Solar photovoltaics" not in find_my_epc_data.columns: - find_my_epc_data["Solar photovoltaics"] = False - - # Retrieve just the data we need - epc_df = epc_df[ - [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename( - columns=asset_list.EPC_API_DATA_NAMES - ) - - epc_df = epc_df.merge( - find_my_epc_data[ - [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ] - .rename(columns=asset_list.FIND_EPC_DATA_NAMES), - how="left", - on=asset_list.DOMNA_PROPERTY_ID - ) - - asset_list.merge_data(epc_df) - # TODO: TEMP!!! - epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) - asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" - ) - - asset_list.extract_attributes() - - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - cleaned = msgpack.unpackb(cleaned, raw=False) - - asset_list.identify_worktypes(cleaned) - - # TODO: We should do this breakdown for flats - def flat_analysis(asset_list): - - # We need to deduce the building name - we strip out the house number - def extract_building_name(x): - # TODO: This doesn't really work - if pd.isnull(x): - return None - house_no = SearchEpc.get_house_number(address=x, postcode=None) - if house_no: - return x.replace(house_no, "").strip() - return x.split(",")[0].strip() - - # We want to deduce if flats have 50% of the properties below C75 - # We group by postcode and property type - grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) - - flat_data = [] - for _, group in grouped: - if "flat" in group["Property Type"].str.lower().values: - num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) - num_below_c75 = group["SAP score on register"].lt(75).sum() - - flat_data.append( - { - "Postcode": group[POSTCODE_COLUMN].iloc[0], - "Property Type": "Flat", - "Number of Flats with EPC": num_flats, - "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) - } - ) - - flat_data = pd.DataFrame(flat_data) - - return flat_data - - flat_data = flat_analysis(asset_list) - - # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" - # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data - - with pd.ExcelWriter(filename) as writer: - asset_list.to_excel(writer, sheet_name="EPC Data", index=False) - flat_data.to_excel(writer, sheet_name="Flat Data", index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt deleted file mode 100644 index e69de29b..00000000 From 759e81f6606ee9355612ed9526acd8c77dc12096 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:25:38 +0000 Subject: [PATCH 59/72] refactoring --- asset_list/app.py | 20 +++++++++++++++----- asset_list/requirements.txt | 3 ++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index 1a7788fe..df2fe9cc 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,13 +21,21 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + df, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, epc_api_only=False, row_id_name="row_id" ): + # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs + property_type_map = { + "house": "House", + "flat": "Flat", + "maisonette": "Maisonette", + "bungalow": "Bungalow", + } + epc_data = [] errors = [] no_epc = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + for _, home in tqdm(df.iterrows(), total=len(df)): try: postcode = home[postcode_column] house_number = str(home[address1_column]).strip() @@ -42,19 +50,21 @@ def get_data( if pd.isnull(uprn): uprn = None + property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None) + searcher = SearchEpc( address1=str(house_no), postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="", - property_type=None, + property_type=property_type, fast=True, full_address=full_address, max_retries=5, uprn=uprn ) # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.property_type = property_type searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) @@ -317,7 +327,7 @@ def app(): continue chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( - asset_list=chunk, + df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, address1_column=asset_list.STANDARD_ADDRESS_1, diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index fd045d46..fd43ac64 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -8,4 +8,5 @@ boto3 openpyxl openai tiktoken -msgpack \ No newline at end of file +msgpack +beautifulsoup4 \ No newline at end of file From 33558957df5b718fd81f9a89064f24ceffa2b139 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:00:12 +0000 Subject: [PATCH 60/72] adding methodology to estimate the EPC if we don't have it --- asset_list/app.py | 22 +++++++++++++--------- backend/SearchEpc.py | 2 +- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index df2fe9cc..5bbf25d4 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,9 +21,13 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data( - df, fulladdress_column, address1_column, postcode_column, manual_uprn_map, - uprn_column=None, epc_api_only=False, row_id_name="row_id" + df, manual_uprn_map, epc_api_only=False, row_id_name="row_id" ): + uprn_column = AssetList.STANDARD_UPRN + fulladdress_column = AssetList.STANDARD_FULL_ADDRESS + address1_column = AssetList.STANDARD_ADDRESS_1 + postcode_column = AssetList.STANDARD_POSTCODE + # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs property_type_map = { "house": "House", @@ -57,14 +61,14 @@ def get_data( postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="", - property_type=property_type, + property_type=None, fast=True, full_address=full_address, max_retries=5, uprn=uprn ) # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.property_type = None searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) @@ -102,6 +106,11 @@ def get_data( searcher.find_property(skip_os=True) + # As a final resort, we estimate the EPC + if property_type is not None: + searcher.ordnance_survey_client.property_type = property_type + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: no_epc.append(home[row_id_name]) continue @@ -328,12 +337,7 @@ def app(): chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, - address1_column=asset_list.STANDARD_ADDRESS_1, - postcode_column=asset_list.STANDARD_POSTCODE, manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=asset_list.STANDARD_UPRN ) # We now retrieve any failed properties diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 79a041ec..0d921bec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -337,7 +337,7 @@ class SearchEpc: if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] - if data: + if data["rows"]: api_response["msg"] = self.SUCCESS return api_response["msg"] From d69baa21dab3c066b20b3823f9bac52da4eba7da Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:22:00 +0000 Subject: [PATCH 61/72] estimating epcs --- asset_list/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asset_list/app.py b/asset_list/app.py index 5bbf25d4..229bf171 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -337,6 +337,7 @@ def app(): chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, + row_id_name=asset_list.DOMNA_PROPERTY_ID, manual_uprn_map=MANUAL_UPRN_MAP, ) From d1dc536ab0c4424ac6fda9c39422659a547e8fbe Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:33:18 +0000 Subject: [PATCH 62/72] merging on epc data --- asset_list/AssetList.py | 2 +- asset_list/app.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index ffe53d40..2d224daa 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -751,7 +751,7 @@ class AssetList: # We produce some additional fields # 1) Is the SAP rating below C75 self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].astype(float) <= self.FILLED_CAVITY_SAP_THRESHOLD ) # 2) Flag anything where the EPC is older than 5 years diff --git a/asset_list/app.py b/asset_list/app.py index 229bf171..34cc9579 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -376,8 +376,6 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) - # TODO: TEMP!!! - epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -445,11 +443,6 @@ def app(): ) asset_list.merge_data(epc_df) - # TODO: TEMP!!! - epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) - asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" - ) asset_list.extract_attributes() From ea1a7b559d7fd3fa1c3f4b54365fe2eeebf0a3b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 22:57:56 +0000 Subject: [PATCH 63/72] fixed bug with calling find epc --- asset_list/app.py | 10 +++++----- etl/find_my_epc/RetrieveFindMyEpc.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index 34cc9579..3c1ab627 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -34,6 +34,9 @@ def get_data( "flat": "Flat", "maisonette": "Maisonette", "bungalow": "Bungalow", + "block house": "House", + "coach house": "House", + "bedsit": "Flat" } epc_data = [] @@ -107,7 +110,7 @@ def get_data( searcher.find_property(skip_os=True) # As a final resort, we estimate the EPC - if property_type is not None: + if property_type is not None and searcher.newest_epc is None: searcher.ordnance_survey_client.property_type = property_type searcher.find_property(skip_os=True) @@ -344,11 +347,8 @@ def app(): # We now retrieve any failed properties chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] epc_data_failed, _, _ = get_data( - asset_list=chunk_failed, + df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, manual_uprn_map=MANUAL_UPRN_MAP, epc_api_only=False ) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index eaba1058..9852cc0d 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -330,7 +330,8 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", "time_temperature_zone_control" ], "Replacement warm air unit": [], - "Secondary glazing": ["secondary_glazing"] + "Secondary glazing": ["secondary_glazing"], + "Condensing heating unit": ["boiler_upgrade"], } survey = True From 7b4218299ff1c3b108d3259cecb7fee13f4d1096 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 12:11:47 +0000 Subject: [PATCH 64/72] adding work reasons --- asset_list/AssetList.py | 78 ++++++++++++++++++++++++++++++++++++----- asset_list/app.py | 37 ++++++++++--------- 2 files changed, 91 insertions(+), 24 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2d224daa..54f6cd96 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -296,7 +296,7 @@ class AssetList: ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" - ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" + ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}" # These are the descriptions that we look for in the EPC data that are indicative of no insulation EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ @@ -775,7 +775,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": "No EPC Age Band" + "does_age_band_match_epc_age_band": "No EPC Age Band" } ) continue @@ -800,7 +800,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": year_lower_bound, "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -820,7 +820,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": 1899, - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -842,7 +842,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -864,7 +864,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(lower_date), "epc_year_upper_bound": int(upper_date), - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) @@ -892,7 +892,12 @@ class AssetList: (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) ) self.standardised_asset_list["epc_indicates_empty_cavity"] = ( @@ -1206,6 +1211,11 @@ class AssetList: self.standardised_asset_list["solar_epc_floor_is_other_insulated"] ) + # Drop anything we don't need + self.standardised_asset_list = self.standardised_asset_list.drop( + columns=["walls_u_value", "roof_u_value", "floor_u_value"] + ) + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives @@ -1219,7 +1229,11 @@ class AssetList: ).sum() ), "Cavity Extraction": ( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum() + ( + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + ~self.standardised_asset_list["epc_indicates_empty_cavity"] & + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + ).sum() ), "Solar PV (Solid Floor)": ( self.standardised_asset_list["solar_eligible_solid_floor"].sum() @@ -1234,3 +1248,51 @@ class AssetList: self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() ) } + + # Finally, we note why each property has been flagged + self.standardised_asset_list["cavity_reason"] = None + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + "Non-Intrusive Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ), + "EPC Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + # Flag extraction + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction", + self.standardised_asset_list["cavity_reason"] + ) + + # Flag solar + self.standardised_asset_list["solar_reason"] = None + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor"], + "Solid Floor, Insulated, No Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"], + "Solid Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor"], + "Other Floor, Insulated, No Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], + "Other Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) diff --git a/asset_list/app.py b/asset_list/app.py index 3c1ab627..65d4ab87 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -376,6 +376,7 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) + epc_df["estimated"] = epc_df["estimated"].fillna(False) # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -454,36 +455,40 @@ def app(): asset_list.identify_worktypes(cleaned) + from pprint import pprint + pprint(asset_list.work_type_figures) + # TODO: We should do this breakdown for flats def flat_analysis(asset_list): # We need to deduce the building name - we strip out the house number - def extract_building_name(x): - # TODO: This doesn't really work - if pd.isnull(x): - return None - house_no = SearchEpc.get_house_number(address=x, postcode=None) - if house_no: - return x.replace(house_no, "").strip() - return x.split(",")[0].strip() # We want to deduce if flats have 50% of the properties below C75 # We group by postcode and property type - grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) + grouped = asset_list.standardised_asset_list.groupby( + [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE] + ) flat_data = [] for _, group in grouped: - if "flat" in group["Property Type"].str.lower().values: - num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) - num_below_c75 = group["SAP score on register"].lt(75).sum() + if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values: + num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() flat_data.append( { - "Postcode": group[POSTCODE_COLUMN].iloc[0], + "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0], "Property Type": "Flat", "Number of Flats with EPC": num_flats, "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "num_flats_below_c69": num_flats_below_c69, } ) @@ -494,11 +499,11 @@ def app(): flat_data = flat_analysis(asset_list) # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data with pd.ExcelWriter(filename) as writer: - asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) flat_data.to_excel(writer, sheet_name="Flat Data", index=False) matches_review = asset_list[ From 99a0948e2bd3ab14197821a694cbf1d2383baff3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 16:11:02 +0000 Subject: [PATCH 65/72] getting ready to work on the colchester data --- asset_list/AssetList.py | 82 ++++++++++++++++++++++++++++++++-------- asset_list/app.py | 83 ++++++----------------------------------- 2 files changed, 78 insertions(+), 87 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 54f6cd96..2b80287c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -343,6 +343,7 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} + self.flat_data = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -649,6 +650,9 @@ class AssetList: logger.info("Applying standardisation to asset list") for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable + "_original_from_landlord"] = ( + self.standardised_asset_list[variable].copy() + ) self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): @@ -663,6 +667,12 @@ class AssetList: # Apply renames to our standard names # Perform final variable selection and renaming: + + # We add the original columns to the keep variables + self.keep_variables += [ + k + "_original_from_landlord" for k in self.variable_mappings.keys() + ] + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) @@ -912,18 +922,6 @@ class AssetList: self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) - - self.standardised_asset_list["empty_cavity"] = ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | - self.standardised_asset_list["epc_indicates_empty_cavity"] - ) - # We add a reason - self.standardised_asset_list["empty_cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - "Non-Intrusive Data", - "EPC Data" - ) - ###################################################### # Extraction ###################################################### @@ -933,7 +931,7 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) + (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"]) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) @@ -996,6 +994,12 @@ class AssetList: ) ) + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: Insulated"].isin( + ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] + ) + ) + # TODO: We don't have information about the roof from this landlord # We merge on the u-value for average thermal transmittance @@ -1146,7 +1150,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & @@ -1165,7 +1170,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & @@ -1216,6 +1222,15 @@ class AssetList: columns=["walls_u_value", "roof_u_value", "floor_u_value"] ) + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible_solid_floor"] & + ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] + # ~self.standardised_asset_list["solar_eligible_other_floor"] & + # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ) + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives @@ -1296,3 +1311,40 @@ class AssetList: "Other Floor, Insulated, Needs Loft", self.standardised_asset_list["solar_reason"] ) + + def flat_analysis(self): + + # We need to deduce the building name - we strip out the house number + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = self.standardised_asset_list.groupby( + [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE] + ) + + flat_data = [] + for _, group in grouped: + if "flat" in group[self.STANDARD_PROPERTY_TYPE].values: + num_flats = group[self.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() + + flat_data.append( + { + "Postcode": group[self.STANDARD_POSTCODE].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "Number of Flats Below C69": num_flats_below_c69, + } + ) + + flat_data = pd.DataFrame(flat_data) + + self.flat_data = flat_data diff --git a/asset_list/app.py b/asset_list/app.py index 65d4ab87..f164e94e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -4,6 +4,7 @@ import json import pandas as pd import numpy as np from tqdm import tqdm +from pprint import pprint import msgpack from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList @@ -239,23 +240,18 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # For Westward - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - DATA_FILENAME = "WESTWARD - completed list..xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" SHEET_NAME = "Sheet1" - - POSTCODE_COLUMN = "WFT EDIT Postcode" - FULLADDRESS_COLUMN = "Address" + POSTCODE_COLUMN = 'Full Address.1' + FULLADDRESS_COLUMN = "Full Address" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "house_number_extraction" - + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build date" - UPRN_COLUMN = "UPRN" - # If we have the non-intrusives data, this should be true - HAS_NON_INTRUSIVES = True - PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + PROPERTY_YEAR_BUILT = "Build Date" + UPRN_COLUMN = None + PROPERTY_TYPE_COLUMN = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -297,20 +293,6 @@ def app(): asset_list.apply_standardiation() - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = 'Full Address.1' - # FULLADDRESS_COLUMN = "Full Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "first_word" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build Date" - # UPRN_COLUMN = None - # # If we have the non-intrusives data, this should be true - # HAS_NON_INTRUSIVES = True - ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time @@ -455,48 +437,9 @@ def app(): asset_list.identify_worktypes(cleaned) - from pprint import pprint pprint(asset_list.work_type_figures) - # TODO: We should do this breakdown for flats - def flat_analysis(asset_list): - - # We need to deduce the building name - we strip out the house number - - # We want to deduce if flats have 50% of the properties below C75 - # We group by postcode and property type - grouped = asset_list.standardised_asset_list.groupby( - [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE] - ) - - flat_data = [] - for _, group in grouped: - if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values: - num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0] - num_below_c75 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum() - # Check if any flats are below C69 - num_flats_below_c69 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(69).sum() - - flat_data.append( - { - "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0], - "Property Type": "Flat", - "Number of Flats with EPC": num_flats, - "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), - "num_flats_below_c69": num_flats_below_c69, - } - ) - - flat_data = pd.DataFrame(flat_data) - - return flat_data - - flat_data = flat_analysis(asset_list) + asset_list.flat_analysis() # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" @@ -504,8 +447,4 @@ def app(): with pd.ExcelWriter(filename) as writer: asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) - flat_data.to_excel(writer, sheet_name="Flat Data", index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] + asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) From 5391afeaaaa024ff7b1a54fc18f565b9c46a3925 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 16:52:42 +0000 Subject: [PATCH 66/72] handling the case of landlord property id being missing --- asset_list/AssetList.py | 2 +- asset_list/app.py | 58 ++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2b80287c..c2784eb1 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -378,7 +378,7 @@ class AssetList: self.keep_variables = [] # Finally, we handle the case where the landlord's property ID is actually the OS UPRN - if self.landlord_uprn == self.landlord_property_id: + if (self.landlord_uprn == self.landlord_property_id) and (self.landlord_property_id is not None): self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN diff --git a/asset_list/app.py b/asset_list/app.py index f164e94e..89b15c06 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -240,39 +240,43 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = 'Full Address.1' - FULLADDRESS_COLUMN = "Full Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] - MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build Date" - UPRN_COLUMN = None - PROPERTY_TYPE_COLUMN = None + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + sheet_name = "Sheet1" + postcode_column = 'Full Address.1' + fulladdress_column = "Full Address" + address1_column = None + address1_method = "first_word" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Date" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_wall_construction = "Wallinsul" + landlord_heating_system = "HeatSorc" + landlord_existing_pv = None + landlord_property_id = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} asset_list = AssetList( - local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + local_filepath=os.path.join(data_folder, data_filename), header=0, - sheet_name=SHEET_NAME, - address1_colname=ADDRESS1_COLUMN, - postcode_colname=POSTCODE_COLUMN, - landlord_property_id="UPRN", - full_address_colname=FULLADDRESS_COLUMN, - full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, - missing_postcodes_method=MISSING_POSTCODES_METHOD, - address1_extraction_method=ADDRESS1_METHOD, - landlord_year_built=PROPERTY_YEAR_BUILT, - landlord_uprn=UPRN_COLUMN, - landlord_property_type=PROPERTY_TYPE_COLUMN, - landlord_wall_construction="Wall Construction (EPC)", - landlord_heating_system="Heat Source", - landlord_existing_pv="PV (Y/N)" + sheet_name=sheet_name, + address1_colname=address1_column, + postcode_colname=postcode_column, + landlord_property_id=landlord_property_id, + full_address_colname=fulladdress_column, + full_address_cols_to_concat=address_cols_to_concat, + missing_postcodes_method=missing_postcodes_method, + address1_extraction_method=address1_method, + landlord_year_built=landlord_year_built, + landlord_uprn=landlord_os_uprn, + landlord_property_type=landlord_property_type, + landlord_wall_construction=landlord_wall_construction, + landlord_heating_system=landlord_heating_system, + landlord_existing_pv=landlord_existing_pv ) asset_list.init_standardise() From 8fa8307e33dc27793815eccadbb11fa3a28d1c68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 18:36:00 +0000 Subject: [PATCH 67/72] ai mappings --- asset_list/AssetList.py | 32 ++++++++++++++++++++++- asset_list/app.py | 2 +- asset_list/mappings/heating_systems.py | 35 ++++++++++++++++++++------ asset_list/mappings/property_type.py | 9 ++++++- asset_list/mappings/walls.py | 13 +++++++--- 5 files changed, 77 insertions(+), 14 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index c2784eb1..06ec5907 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -598,7 +598,35 @@ class AssetList: self.standardised_asset_list[self.landlord_year_built].dt.year ) else: - raise NotImplementedError("Year built column must be a datetime - implement me") + # We attempt to convert the year built to a datetime, by detecting the format and converting + + def extract_year(date_str): + """ + Extracts the year from a date string in the format '01-Jul-YYYY'. + Returns the extracted year as an integer or None if the format is incorrect. + """ + known_errors = ["#MULTIVALUE"] + + if pd.isnull(date_str) or date_str in known_errors: + return None + + if isinstance(date_str, str): + match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) + if match: + return int(match.group(1)) # Extract the year and convert to integer + + if isinstance(date_str, datetime): + return date_str.year + + # Check if date_str is a year itself + if str(date_str).isdigit() & (len(str(date_str)) == 4): + return int(date_str) + + raise NotImplementedError("Unhandled format for year built - implement me") + + self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[ + self.landlord_year_built + ].apply(extract_year) # We now create standard lookups to_remap = { @@ -619,6 +647,8 @@ class AssetList: "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS } } + # Keep just entries where the key is not None + to_remap = {k: v for k, v in to_remap.items() if k is not None} for variable, config in to_remap.items(): logger.info("Standardising variable: %s", variable) diff --git a/asset_list/app.py b/asset_list/app.py index 89b15c06..1cb7808e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -255,7 +255,7 @@ def app(): landlord_wall_construction = "Wallinsul" landlord_heating_system = "HeatSorc" landlord_existing_pv = None - landlord_property_id = None + landlord_property_id = "Property Reference" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 89bfe0c4..b58f13f2 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -1,3 +1,5 @@ +import numpy as np + STANDARD_HEATING_SYSTEMS = { "gas combi boiler", "electric storage heaters", @@ -35,12 +37,31 @@ HEATING_MAPPINGS = { "Eco Electric Radiators": "electric radiators", "Gas fire": "other", "Backboiler - Solid fuel": "other", - 'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters', - 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', - 'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi', - 'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel', - 'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators', - 'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler', + 'combi - gas': 'gas combi boiler', + 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', + 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', + 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', + 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', + 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', + 'lpg boiler': 'boiler - other fuel', + 'electric boiler': 'electric boiler', 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', - 'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'eco electric radiators': 'electric radiators', + 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'ASHP': 'air source heat pump', + 'COMMHEAT': 'communal gas boiler', + 'GBB': 'gas combi boiler', + 'GFS': 'gas condensing boiler', + 'GWA': 'gas condensing boiler', + 'GWM': 'gas condensing combi', + 'HDU': 'district heating', + 'OILBLR': 'oil boiler', + 'SOLIDFUEL': 'boiler - other fuel', + 'STORHTR': 'high heat retention storage heaters', + np.nan: 'unknown', } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index ec569123..2612f058 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -1,7 +1,7 @@ # These are the standard categories for property types STANDARD_PROPERTY_TYPES = { "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", - "unknown", "other" + "unknown", "other", "block of flats" } # This is a basic mapping that we use to map values that we've seen commonly to standard values @@ -15,4 +15,11 @@ PROPERTY_MAPPING = { "BEDSIT": "bedsit", "COACHSE": "coach house", "coachse": "coach house", + 'Admin Unit Type': 'unknown', + 'Block': 'block of flats', + 'Bungalow': 'bungalow', + 'Flat': 'flat', + 'House': 'house', + 'Maisonette': 'maisonette', + 'Stairwell': 'other' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 1fc52fcb..82b31d01 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,7 +1,8 @@ STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", - "timber frame", "uninsulated solid brick", - "insulated solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", + "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", + "timber frame", + "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", "new build - average thermal transmittance", } @@ -70,8 +71,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.28 w/m?k': 'unknown', 'Cavity wall, filled cavity': 'filled cavity', 'Cavity wall, filled cavity and external insulation': 'filled cavity', - 'Granite or whinstone, as built, no insulation (assumed)': 'granite or ' - 'whinstone', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone', 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', 'Solid brick, with external insulation': 'insulated solid brick', @@ -84,4 +84,9 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Timber frame, as built, no insulation (assumed)': 'timber frame', 'Timber frame, as built, partial insulation (assumed)': 'timber frame', 'Timber frame, with additional insulation': 'timber frame', + 'CAVITY': 'partial unknown cavity', + 'COMB': 'unknown', + 'NONE': 'unknown', + 'NOTKNOWN': 'unknown', + 'SOLID': 'solid brick unknown insulation', } From c3049732f0d680a38aa9acacd3f15ff9e16d80f0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 18:44:06 +0000 Subject: [PATCH 68/72] handling block of flats --- asset_list/AssetList.py | 7 +++++++ asset_list/app.py | 25 ++++++++++++++++--------- asset_list/mappings/heating_systems.py | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 06ec5907..72086c60 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -344,6 +344,7 @@ class AssetList: # Will be used to store aggregated figures against the various work types self.work_type_figures = {} self.flat_data = None + self.duplicated_addresses = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -691,6 +692,12 @@ class AssetList: f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " f"addresses - dropping" ) + + # Keep a record of duplicates + self.duplicated_addresses = self.standardised_asset_list[ + self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ][[self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]].copy() + self.standardised_asset_list = self.standardised_asset_list[ ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() ] diff --git a/asset_list/app.py b/asset_list/app.py index 1cb7808e..a24c4043 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -45,6 +45,12 @@ def get_data( no_epc = [] for _, home in tqdm(df.iterrows(), total=len(df)): try: + + # If we have a block of flats, we cannot retrieve this data + if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats": + no_epc.append(home[row_id_name]) + continue + postcode = home[postcode_column] house_number = str(home[address1_column]).strip() full_address = home[fulladdress_column].strip() @@ -283,16 +289,17 @@ def app(): # We produce the new maps, which can be saved for future useage new_property_type_map = PROPERTY_MAPPING.copy().update( - asset_list.variable_mappings[asset_list.landlord_property_type] + asset_list.variable_mappings[asset_list.landlord_property_type] if asset_list.landlord_property_type else {} ) new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_wall_construction] + asset_list.variable_mappings[asset_list.landlord_wall_construction] if + asset_list.landlord_wall_construction else {} ) new_heating_map = HEATING_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_heating_system] + asset_list.variable_mappings[asset_list.landlord_heating_system] if asset_list.landlord_heating_system else {} ) new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_existing_pv] + asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} ) asset_list.apply_standardiation() @@ -305,7 +312,7 @@ def app(): skip = None # Used to skip already completed chunks chunk_size = 5000 filename = "Chunk {i}.csv" - download_folder = os.path.join(DATA_FOLDER, "Chunks") + download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): os.makedirs(download_folder) @@ -343,12 +350,12 @@ def app(): # Append the failed data to the main data # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False) # Store the errors and no-data locally - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f: json.dump(errors_chunk, f) - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f: json.dump(no_epc_chunk, f) # We read in and concatenate the created created chunks @@ -446,7 +453,7 @@ def app(): asset_list.flat_analysis() # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" + filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data with pd.ExcelWriter(filename) as writer: diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index b58f13f2..4879efcc 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -62,6 +62,6 @@ HEATING_MAPPINGS = { 'HDU': 'district heating', 'OILBLR': 'oil boiler', 'SOLIDFUEL': 'boiler - other fuel', - 'STORHTR': 'high heat retention storage heaters', + 'STORHTR': 'electric storage heaters', np.nan: 'unknown', } From 0ffc59861c4d70a822c0830838bc740a2598331f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 25 Feb 2025 08:19:08 +0000 Subject: [PATCH 69/72] examining results on colchester --- asset_list/AssetList.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 72086c60..0156a2a3 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -233,7 +233,8 @@ class AssetList: "secondheat-description": "epc_secondary_heating", "transaction-type": "epc_reason", "energy-consumption-current": "epc_heat_demand", - "photo-supply": "epc_photo_supply" + "photo-supply": "epc_photo_supply", + "estimated": "estimated" } FIND_EPC_DATA_NAMES = { "heating_text": "epc_estiamted_heating_kwh", @@ -714,6 +715,22 @@ class AssetList: columns=self.rename_map ) + # We fill any standard columns that are not in the data because they were not provided by the landlord + missing_variables = [ + v for v in [ + self.STANDARD_EXISTING_PV, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_UPRN, + self.STANDARD_PROPERTY_TYPE, + self.STANDARD_YEAR_BUILT, + self.STANDARD_WALL_CONSTRUCTION, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_EXISTING_PV + ] if v not in self.standardised_asset_list.columns + ] + for v in missing_variables: + self.standardised_asset_list[v] = None + def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id @@ -963,7 +980,6 @@ class AssetList: # Extraction ###################################################### - # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged # as needing a CIGA check. What is the logic we should be applying here? self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & @@ -974,6 +990,15 @@ class AssetList: ) ) + z = self.standardised_asset_list[ + self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" + ] + z["non-intrusives: Insulated"].value_counts() + z["non-intrusives: Material"].value_counts() + z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() + z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() + zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + ###################################################### # Solar ###################################################### From 67f3e8ab703ea2893cdb9f9a6a9bd7bbee9344f8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 25 Feb 2025 08:41:08 +0000 Subject: [PATCH 70/72] reviewing methodology --- asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 0156a2a3..76f2b145 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -951,7 +951,7 @@ class AssetList: ###################################################### # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled # 2) The age is before 1995 - # TODO: 3) Remove anything that likley has access issues + # 3) We don't remove anything that haas access issues yet self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & @@ -976,6 +976,19 @@ class AssetList: self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) + + z0 = self.standardised_asset_list[ + self.standardised_asset_list["epc_indicates_empty_cavity"] & ( + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ) + ] + z0['non-intrusives: Construction'].value_counts() + z0['non-intrusives: Insulated'].value_counts() + z00 = z0[z0['non-intrusives: Insulated'] == "EWI"] + + # If the EPC is estimated, perhaps we should defer to the non-intrusives? + z00[""] + ###################################################### # Extraction ###################################################### @@ -990,14 +1003,26 @@ class AssetList: ) ) - z = self.standardised_asset_list[ - self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" - ] - z["non-intrusives: Insulated"].value_counts() - z["non-intrusives: Material"].value_counts() - z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() - z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() - zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + # z3 = self.standardised_asset_list[ + # self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + # ] + # z3['non-intrusives: Material'].value_counts() + # self.standardised_asset_list['non-intrusives: Material'].value_counts() + # + # z = self.standardised_asset_list[ + # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" + # ] + # z["non-intrusives: Insulated"].value_counts() + # z["non-intrusives: Material"].value_counts() + # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() + # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() + # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min() + # z[self.STANDARD_YEAR_BUILT].describe() + # + # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + # z2 = self.standardised_asset_list[ + # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO" + # ] ###################################################### # Solar @@ -1159,6 +1184,10 @@ class AssetList: .lower().str.contains("solid") ) & ( ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + ~self.standardised_asset_list["estimated"] ) ) | ( ( @@ -1180,6 +1209,10 @@ class AssetList: .lower().str.contains("suspended") ) & ( ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + ~self.standardised_asset_list["estimated"] ) ) | ( ( From ddfbf33494f6741b974217fffc5bb4ba784560a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 26 Feb 2025 11:00:12 +0000 Subject: [PATCH 71/72] westward complete --- asset_list/AssetList.py | 95 ++++++++++++++----------- asset_list/app.py | 42 +++++++---- asset_list/mappings/walls.py | 2 +- etl/customers/remote_assessments/app.py | 14 ++-- recommendations/HeatingRecommender.py | 2 +- 5 files changed, 94 insertions(+), 61 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 76f2b145..31b11c66 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -286,7 +286,7 @@ class AssetList: # This SAP threshold is a key search criteria for properties that may be eligible for extraction FILLED_CAVITY_SAP_THRESHOLD = 75 # This SAP the - EMPTY_CAVITY_SAP_THRESHOLD = 71 + EMPTY_CAVITY_SAP_THRESHOLD = 75 # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 @@ -956,13 +956,28 @@ class AssetList: (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"] ] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) + # Let's also flag work that looks eligible without the SAP filter + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) + ) + + # If non_intrusive_indicates_empty_cavity is True, + # set non_intrusive_indicates_empty_cavity_no_sap_filter to False + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + False, + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ) self.standardised_asset_list["epc_indicates_empty_cavity"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( @@ -977,17 +992,16 @@ class AssetList: ) ) - z0 = self.standardised_asset_list[ - self.standardised_asset_list["epc_indicates_empty_cavity"] & ( - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] - ) - ] - z0['non-intrusives: Construction'].value_counts() - z0['non-intrusives: Insulated'].value_counts() - z00 = z0[z0['non-intrusives: Insulated'] == "EWI"] - - # If the EPC is estimated, perhaps we should defer to the non-intrusives? - z00[""] + # If the EPC is esimtated, we defer to the non-intrusives + self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + self.standardised_asset_list["estimated"] + ), + False, + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) ###################################################### # Extraction @@ -997,33 +1011,14 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"]) + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) ) - # z3 = self.standardised_asset_list[ - # self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] - # ] - # z3['non-intrusives: Material'].value_counts() - # self.standardised_asset_list['non-intrusives: Material'].value_counts() - # - # z = self.standardised_asset_list[ - # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" - # ] - # z["non-intrusives: Insulated"].value_counts() - # z["non-intrusives: Material"].value_counts() - # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() - # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() - # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min() - # z[self.STANDARD_YEAR_BUILT].describe() - # - # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] - # z2 = self.standardised_asset_list[ - # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO" - # ] - ###################################################### # Solar ###################################################### @@ -1114,7 +1109,7 @@ class AssetList: ) | ( self.standardised_asset_list[ "walls_u_value"].apply( - lambda x: x <= 0.3 if not pd.isnull( + lambda x: x <= 0.7 if not pd.isnull( x) else False ) ) @@ -1141,7 +1136,7 @@ class AssetList: "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False ) | ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) >= 270 if str(x).isdigit() else False + lambda x: int(x) >= 200 if str(x).isdigit() else False ) ) | ( self.standardised_asset_list["roof_u_value"].apply( @@ -1152,7 +1147,7 @@ class AssetList: self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) < 270 if str(x).isdigit() else False + lambda x: int(x) < 200 if str(x).isdigit() else False ) # TODO: Fill with False - should be temp! @@ -1187,7 +1182,7 @@ class AssetList: ) & ( # We do not utilise estimated EPCs for this method because we will always find that # "epc_has_floor_recommendation" is False - ~self.standardised_asset_list["estimated"] + (self.standardised_asset_list["estimated"] == False) ) ) | ( ( @@ -1212,7 +1207,7 @@ class AssetList: ) & ( # We do not utilise estimated EPCs for this method because we will always find that # "epc_has_floor_recommendation" is False - ~self.standardised_asset_list["estimated"] + self.standardised_asset_list["estimated"] == False ) ) | ( ( @@ -1274,6 +1269,7 @@ class AssetList: ) # Other floor type, fully insulated + self.standardised_asset_list["solar_eligible_other_floor"] = ( # Landlord data or EPC data indicates the heating system is appropriate ( @@ -1332,6 +1328,9 @@ class AssetList: "Empty Cavity (non-intrusives)": ( self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() ), + "Empty Cavity (non-intrusives, no SAP filter)": ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), "Empty Cavity (EPC)": ( ( self.standardised_asset_list["epc_indicates_empty_cavity"] & @@ -1359,6 +1358,17 @@ class AssetList: ) } + # We produce a breakdown of the property types, for cavity fills + cavity_fills = self.standardised_asset_list[ + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | ( + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + ] + + self.work_type_breakdowns = { + "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts() + } + # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None self.standardised_asset_list["cavity_reason"] = np.where( @@ -1366,6 +1376,11 @@ class AssetList: "Non-Intrusive Data Showed Empty Cavity", self.standardised_asset_list["cavity_reason"] ) + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["epc_indicates_empty_cavity"] & diff --git a/asset_list/app.py b/asset_list/app.py index a24c4043..09ccac02 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -246,22 +246,40 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # sheet_name = "Sheet1" + # postcode_column = 'Full Address.1' + # fulladdress_column = "Full Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build Date" + # landlord_os_uprn = None + # landlord_property_type = "Property Type" + # landlord_wall_construction = "Wallinsul" + # landlord_heating_system = "HeatSorc" + # landlord_existing_pv = None + # landlord_property_id = "Property Reference" + + # For Westward + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + data_filename = "WESTWARD - completed list..xlsx" sheet_name = "Sheet1" - postcode_column = 'Full Address.1' - fulladdress_column = "Full Address" + postcode_column = "WFT EDIT Postcode" + fulladdress_column = "Address" address1_column = None - address1_method = "first_word" + address1_method = "house_number_extraction" address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "Build Date" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_wall_construction = "Wallinsul" - landlord_heating_system = "HeatSorc" - landlord_existing_pv = None - landlord_property_id = "Property Reference" + landlord_year_built = "Build date" + landlord_os_uprn = "UPRN" + landlord_property_type = "Location type" + landlord_wall_construction = "Wall Construction (EPC)" + landlord_heating_system = "Heat Source" + landlord_existing_pv = "PV (Y/N)" + landlord_property_id = "Place ref" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 82b31d01..78d64988 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -84,7 +84,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Timber frame, as built, no insulation (assumed)': 'timber frame', 'Timber frame, as built, partial insulation (assumed)': 'timber frame', 'Timber frame, with additional insulation': 'timber frame', - 'CAVITY': 'partial unknown cavity', + 'CAVITY': 'cavity unknown insulation', 'COMB': 'unknown', 'NONE': 'unknown', 'NOTKNOWN': 'unknown', diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 15f59c5e..aac0a1a6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 133 +PORTFOLIO_ID = 137 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,10 +19,10 @@ def app(): asset_list = [ { - "address": "40", - "postcode": "PE4 5BB", - "uprn": 100090220519, - } + "address": "41 Gainsborough Way", + "postcode": "BA21 5XU", + "uprn": 30016708, + }, ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 100090220519, - "valuation": 135_000 + "uprn": 30016708, + "valuation": 189000 } ] # Store valuation data to s3 diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index c5c07f89..dd81680a 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -993,7 +993,7 @@ class HeatingRecommender: # We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler has_inefficient_water = ( self.property.data["mains-gas-flag"] and - self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] + self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"] ) non_invasive_recommendation = next(( From bb8070967b3f0e8e0234fd07e0428acc9568d208 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Mar 2025 14:38:01 +0000 Subject: [PATCH 72/72] big commit --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 76 ++++++++++++++++++++++--- asset_list/app.py | 67 +++++++++++----------- backend/Funding.py | 12 ++-- backend/app/plan/router.py | 2 +- etl/customers/remote_assessments/app.py | 41 ++++++++++--- etl/find_my_epc/AssetListEpcData.py | 20 +++++-- recommendations/HeatingRecommender.py | 2 + 9 files changed, 159 insertions(+), 65 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 31b11c66..306edd99 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -344,6 +344,7 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} + self.work_type_breakdowns = {} self.flat_data = None self.duplicated_addresses = None @@ -577,7 +578,7 @@ class AssetList: self.standardised_asset_list[self.landlord_wall_construction] = np.where( self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( "average thermal transmittance" - ), + ) == True, "new build - average thermal transmittance", self.standardised_asset_list[self.landlord_wall_construction] ) @@ -1019,6 +1020,23 @@ class AssetList: ) ) + # Also include work without the SAP filter as optimistic + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) + ) + ) + + # Adjust + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], + False, + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + ) + ###################################################### # Solar ###################################################### @@ -1109,8 +1127,7 @@ class AssetList: ) | ( self.standardised_asset_list[ "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull( - x) else False + lambda x: x <= 0.7 if not pd.isnull(x) else False ) ) ) @@ -1322,26 +1339,58 @@ class AssetList: # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] ) + blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" + ] + + non_blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" + ] + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives - "Empty Cavity (non-intrusives)": ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() + "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(), + "Empty Cavity (non-intrusives, blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum() ), "Empty Cavity (non-intrusives, no SAP filter)": ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), + "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() ), "Empty Cavity (EPC)": ( ( - self.standardised_asset_list["epc_indicates_empty_cavity"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + non_blocks_of_flats["epc_indicates_empty_cavity"] & + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Empty Cavity (EPC, blocks of flat)": ( + ( + blocks_of_flats["epc_indicates_empty_cavity"] & + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] ).sum() ), "Cavity Extraction": ( + ( + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~non_blocks_of_flats["epc_indicates_empty_cavity"] & + non_blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (blocks of flats)": ( + ( + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~blocks_of_flats["epc_indicates_empty_cavity"] & + blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (no SAP filter)": ( ( ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & ~self.standardised_asset_list["epc_indicates_empty_cavity"] & - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] ).sum() ), "Solar PV (Solid Floor)": ( @@ -1398,6 +1447,15 @@ class AssetList: "Non-Intrusive Data Showed Cavity Extraction", self.standardised_asset_list["cavity_reason"] ) + # extraction no sap filter + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) # Flag solar self.standardised_asset_list["solar_reason"] = None diff --git a/asset_list/app.py b/asset_list/app.py index 09ccac02..84999e93 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -246,43 +246,43 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Full Address.1' - # fulladdress_column = "Full Address" - # address1_column = None - # address1_method = "first_word" - # address_cols_to_concat = [] - # missing_postcodes_method = None - # landlord_year_built = "Build Date" - # landlord_os_uprn = None - # landlord_property_type = "Property Type" - # landlord_wall_construction = "Wallinsul" - # landlord_heating_system = "HeatSorc" - # landlord_existing_pv = None - # landlord_property_id = "Property Reference" - - # For Westward - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - data_filename = "WESTWARD - completed list..xlsx" + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" sheet_name = "Sheet1" - postcode_column = "WFT EDIT Postcode" - fulladdress_column = "Address" + postcode_column = 'Full Address.1' + fulladdress_column = "Full Address" address1_column = None - address1_method = "house_number_extraction" + address1_method = "first_word" address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "Build date" - landlord_os_uprn = "UPRN" - landlord_property_type = "Location type" - landlord_wall_construction = "Wall Construction (EPC)" - landlord_heating_system = "Heat Source" - landlord_existing_pv = "PV (Y/N)" - landlord_property_id = "Place ref" + landlord_year_built = "Build Date" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_wall_construction = "Wallinsul" + landlord_heating_system = "HeatSorc" + landlord_existing_pv = None + landlord_property_id = "Property Reference" + + # For Westward + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + # data_filename = "WESTWARD - completed list..xlsx" + # sheet_name = "Sheet1" + # postcode_column = "WFT EDIT Postcode" + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "house_number_extraction" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build date" + # landlord_os_uprn = "UPRN" + # landlord_property_type = "Location type" + # landlord_wall_construction = "Wall Construction (EPC)" + # landlord_heating_system = "Heat Source" + # landlord_existing_pv = "PV (Y/N)" + # landlord_property_id = "Place ref" # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} + manual_uprn_map = {} asset_list = AssetList( local_filepath=os.path.join(data_folder, data_filename), @@ -352,7 +352,7 @@ def app(): epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, - manual_uprn_map=MANUAL_UPRN_MAP, + manual_uprn_map=manual_uprn_map, ) # We now retrieve any failed properties @@ -360,7 +360,7 @@ def app(): epc_data_failed, _, _ = get_data( df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, - manual_uprn_map=MANUAL_UPRN_MAP, + manual_uprn_map=manual_uprn_map, epc_api_only=False ) @@ -464,6 +464,7 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) + # TODO: We should break out the identification of work types to flag blocks of flats specifically asset_list.identify_worktypes(cleaned) pprint(asset_list.work_type_figures) diff --git a/backend/Funding.py b/backend/Funding.py index f0780c51..2839c7ff 100644 --- a/backend/Funding.py +++ b/backend/Funding.py @@ -149,7 +149,8 @@ class Funding: :return: """ measure_table = pd.DataFrame([ - m for m in self.recommendations if m in measures and m["default"] + m for m in self.recommendations if + (m["type"] in measures) or (m["measure_type"] in measures) and m["default"] ]) measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap @@ -180,13 +181,10 @@ class Funding: measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"] measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"] measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False]) - # Recommend the measure, with estimated funding amount - recommended_measure = measure_table.head(1) - return { - "measure_type": recommended_measure["measure_type"], - "estimated_funding": recommended_measure["estimated_funding"] - } + return measure_table[ + ["type", "measure_type", "Cost Savings", "estimated_funding"] + ].rename(columns={"Cost Savings": "project_score"}).to_dict("records") def sap_to_eco_band(self, sap_points): """ diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 76c172ee..d82e774b 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -825,7 +825,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_recommendations=recommendations[p.id], project_scores_matrix=eco_project_scores_matrix, whlg_eligible_postcodes=whlg_eligible_postcodes, - gbis_abs_rate=20, + gbis_abs_rate=15, eco4_abs_rate=15, ) funding_calulator.check_eligibiltiy() diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index aac0a1a6..fc3b7ec6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 137 +PORTFOLIO_ID = 134 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,10 +19,25 @@ def app(): asset_list = [ { - "address": "41 Gainsborough Way", - "postcode": "BA21 5XU", - "uprn": 30016708, + "address": "Flat 2, 42 Malden Road, London NW5 3HG", + "postcode": "NW5 3HG", + "uprn": 5117165, }, + { + "address": "15 Bournville Lane", + "postcode": "B30 2JY", + "uprn": 100070301128 + }, + { + "address": "34 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301140 + }, + { + "address": "36 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301142 + } ] asset_list = pd.DataFrame(asset_list) @@ -52,9 +67,21 @@ def app(): valuation_data = [ { - "uprn": 30016708, - "valuation": 189000 - } + "uprn": 5117165, + "valuation": 467_000 + }, + { + "uprn": 100070301128, + "valuation": 335_000 + }, + { + "uprn": 100070301140, + "valuation": 276_000 + }, + { + "uprn": 100070301142, + "valuation": 276_000 + }, ] # Store valuation data to s3 valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index bce8cd1f..1d2e1472 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -72,12 +72,20 @@ class AssetListEpcData: epc_searcher.find_property(skip_os=True) if epc_searcher.newest_epc is None: continue - - find_epc_searcher = RetrieveFindMyEpc( - address=epc_searcher.newest_epc["address1"], - postcode=epc_searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + # Attempt both methods: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data: {e}") + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() time.sleep(0.5) # We need uprn diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index dd81680a..e4dd3a78 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -852,6 +852,8 @@ class HeatingRecommender: else: heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"] + # TODO:We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion + # we'll keep this for the moment though if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]: heating_simulation_config["hot_water_energy_eff_ending"] = "Average" else: