diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 611d0257..945b5e4e 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -2622,7 +2622,8 @@ class AssetList: # Add in deal and pipeline information programme_data["dealname"] = ( - programme_data[self.STANDARD_FULL_ADDRESS] + " : " + programme_data["domna_product"] + programme_data[self.STANDARD_FULL_ADDRESS] + ", " + + programme_data[self.STANDARD_POSTCODE] + " : " + programme_data["domna_product"] ) programme_data['Pipeline '] = hubspot_config.CRM_PIPELINE_NAME programme_data['Associations: Listing'] = "Property Owner" @@ -2656,7 +2657,11 @@ class AssetList: # Ammend the property type and built form columns programme_data["hubspot_property_type"] = programme_data[self.STANDARD_PROPERTY_TYPE].copy() - programme_data["hubspot_built_form"] = programme_data[self.STANDARD_BUILT_FORM].copy() + # We don't already have this + if self.STANDARD_BUILT_FORM in programme_data.columns: + programme_data["hubspot_built_form"] = programme_data[self.STANDARD_BUILT_FORM].copy() + else: + programme_data["hubspot_built_form"] = None def _replace_property_description_data(programme_data, column_name): """ diff --git a/asset_list/abs_estimates.py b/asset_list/abs_estimates.py index 58adcca6..0cd82dc6 100644 --- a/asset_list/abs_estimates.py +++ b/asset_list/abs_estimates.py @@ -13,10 +13,22 @@ from backend.app.utils import sap_to_epc load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +# project = pd.read_excel( + +# ) +# +# cavity = project[project["cavity_reason"].isin( +# ["EPC Shows Empty Cavity: SAP Rating 54 or less", "EPC Shows Empty Cavity: SAP Rating 55-68"] +# )] + asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/Livewest South-West - Standardised V2.xlsx", - sheet_name="Cavity Route (Insta Review)" + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Abri/Abs Rates/Desktop ABRI data - Standardised After " + 
"Programmes.xlsx", + sheet_name="Reviewed List" ) +asset_list = asset_list[asset_list["cavity_reason"].isin( + ["EPC Shows Empty Cavity: SAP Rating 54 or less", "EPC Shows Empty Cavity: SAP Rating 55-68"] +)] abs_matrix = pd.read_csv( "/Users/khalimconn-kowlessar/Downloads/ECO4 Full Project Scores Matrix.csv" @@ -30,7 +42,7 @@ pps_matrix.columns = [c.strip() for c in pps_matrix.columns] # We need to estimate the number of points the work will produce and the finishing band. For this, we assume 7 for # cavity and 15 for solar. We'll be more specific in the future, but for now, this is a good enough estimate. route = asset_list[["domna_address_1", "domna_postcode", "epc_os_uprn"]].rename( - columns={"domna_address_1": "address", "domna_postcode": "postcode", "epc_os_uprn": "upr"} + columns={"domna_address_1": "address", "domna_postcode": "postcode", "epc_os_uprn": "uprn"} ) route["address"] = route["address"].astype(str) @@ -42,120 +54,121 @@ asset_list_epc_client = AssetListEpcData( asset_list_epc_client.get_data() asset_list_epc_client.get_non_invasive_recommendations() -solar_sap_points = [] -for r in asset_list_epc_client.non_invasive_recommendations: - if not r.get("recommendations"): - continue - solar_recommendations = [ - x for x in r["recommendations"] if "solar_pv" in x["type"] - ] - if solar_recommendations: - solar_recommendations = solar_recommendations[0] - else: - continue - - address = r["address"] - postcode = r["postcode"] - - solar_sap_points.append( - { - "address": address, - "postcode": postcode, - "sap_points": solar_recommendations["sap_points"] - } - ) - -solar_sap_points = pd.DataFrame(solar_sap_points) -solar_sap_points.drop_duplicates(subset=["address", "postcode"], inplace=True) -# Store the sap points in the cavity route to csv -# cwi_sap_points.to_csv( -# "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/cwi_sap_points_livewest_sw.csv", -# index=False -# ) - -avg_solar_points_by_postcode = 
solar_sap_points.groupby(["postcode"]).agg({"sap_points": "mean"}).reset_index() -avg_solar_points = solar_sap_points["sap_points"].median() -asset_list["domna_address_1"] = asset_list["domna_address_1"].astype(str) -asset_list = asset_list.merge( - solar_sap_points, how="left", left_on=["domna_address_1", "domna_postcode"], right_on=["address", "postcode"] -).drop( - columns=["address", "postcode"] -) - -# Fill the sap points with the average cwi points -asset_list = asset_list.merge( - avg_solar_points_by_postcode.rename(columns={"postcode": "domna_postcode"}), - how="left", on=["domna_postcode"], suffixes=("", "_avg") -) -asset_list["sap_points"] = asset_list["sap_points"].fillna(asset_list["sap_points_avg"]) -asset_list.drop(columns=["sap_points_avg"], inplace=True) - -asset_list["sap_points"] = asset_list["sap_points"].fillna(avg_solar_points) -asset_list["post_works_sap"] = asset_list["epc_sap_score_on_register"] + asset_list["sap_points"] -asset_list["post_works_epc"] = asset_list["post_works_sap"].apply(lambda x: sap_to_epc(x)) -asset_list["starting_half_band"] = asset_list["epc_sap_score_on_register"].apply(lambda x: Funding.get_sap_band(x)) -asset_list["ending_half_band"] = asset_list["post_works_sap"].apply(lambda x: Funding.get_sap_band(x)) -asset_list["floor_area_band"] = asset_list["epc_total_floor_area"].apply(lambda x: Funding.get_floor_area_band(x)) - -asset_list["ending_half_band"] = np.where( - (asset_list["post_works_epc"] == asset_list["epc_rating_on_register"]), - "Low_C", - asset_list["ending_half_band"] -) -# Realistically, we'll take the properties to a low C at worst -asset_list["ending_half_band"] = np.where( - (asset_list["post_works_sap"] < 69), - "Low_C", - asset_list["ending_half_band"] -) - -asset_list = asset_list.merge( - abs_matrix, how="left", left_on=["starting_half_band", "ending_half_band", "floor_area_band"], - right_on=['Starting Band', 'Finishing Band', 'Floor Area Segment', ] -) -asset_list = 
asset_list.drop(columns=['Starting Band', 'Finishing Band', 'Floor Area Segment']) - -asset_list = asset_list.rename( - columns={"Cost Savings": "funding_abs"} -) - -print(asset_list["domna_property_id"].duplicated().sum()) - -# Store this data -asset_list.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/livewest_sw_solar_abs_estimates-solar.csv", - index=False -) - -# Cavity process! -# cwi_sap_points = [] +# solar_sap_points = [] # for r in asset_list_epc_client.non_invasive_recommendations: # if not r.get("recommendations"): # continue -# cwi_recommendations = [ -# x for x in r["recommendations"] if "cavity_wall_insulation" in x["type"] +# solar_recommendations = [ +# x for x in r["recommendations"] if "solar_pv" in x["type"] # ] -# if cwi_recommendations: -# cwi_recommendations = cwi_recommendations[0] +# if solar_recommendations: +# solar_recommendations = solar_recommendations[0] # else: # continue # # address = r["address"] # postcode = r["postcode"] # -# cwi_sap_points.append( +# solar_sap_points.append( # { # "address": address, # "postcode": postcode, -# "sap_points": cwi_recommendations["sap_points"] +# "sap_points": solar_recommendations["sap_points"] # } # ) # -# cwi_sap_points = pd.DataFrame(cwi_sap_points) -# cwi_sap_points = pd.read_csv( -# "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/cwi_sap_points_livewest_sw.csv" +# solar_sap_points = pd.DataFrame(solar_sap_points) +# solar_sap_points.drop_duplicates(subset=["address", "postcode"], inplace=True) +# # Store the sap points in the cavity route to csv +# solar_sap_points.to_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Abri/Abs Rates/cwi_sap_points.csv", +# index=False # ) -# cwi_sap_points.drop_duplicates(subset=["address", "postcode"], inplace=True) +# +# avg_solar_points_by_postcode = solar_sap_points.groupby(["postcode"]).agg({"sap_points": "mean"}).reset_index() +# avg_solar_points = solar_sap_points["sap_points"].median() +# 
asset_list["domna_address_1"] = asset_list["domna_address_1"].astype(str) +# asset_list = asset_list.merge( +# solar_sap_points, how="left", left_on=["domna_address_1", "domna_postcode"], right_on=["address", "postcode"] +# ).drop( +# columns=["address", "postcode"] +# ) +# +# # Fill the sap points with the average cwi points +# asset_list = asset_list.merge( +# avg_solar_points_by_postcode.rename(columns={"postcode": "domna_postcode"}), +# how="left", on=["domna_postcode"], suffixes=("", "_avg") +# ) +# asset_list["sap_points"] = asset_list["sap_points"].fillna(asset_list["sap_points_avg"]) +# asset_list.drop(columns=["sap_points_avg"], inplace=True) +# +# asset_list["sap_points"] = asset_list["sap_points"].fillna(avg_solar_points) +# asset_list["post_works_sap"] = asset_list["epc_sap_score_on_register"] + asset_list["sap_points"] +# asset_list["post_works_epc"] = asset_list["post_works_sap"].apply(lambda x: sap_to_epc(x)) +# asset_list["starting_half_band"] = asset_list["epc_sap_score_on_register"].apply(lambda x: Funding.get_sap_band(x)) +# asset_list["ending_half_band"] = asset_list["post_works_sap"].apply(lambda x: Funding.get_sap_band(x)) +# asset_list["floor_area_band"] = asset_list["epc_total_floor_area"].apply(lambda x: Funding.get_floor_area_band(x)) +# +# asset_list["ending_half_band"] = np.where( +# (asset_list["post_works_epc"] == asset_list["epc_rating_on_register"]), +# "Low_C", +# asset_list["ending_half_band"] +# ) +# # Realistically, we'll take the properties to a low C at worst +# asset_list["ending_half_band"] = np.where( +# (asset_list["post_works_sap"] < 69), +# "Low_C", +# asset_list["ending_half_band"] +# ) +# +# asset_list = asset_list.merge( +# abs_matrix, how="left", left_on=["starting_half_band", "ending_half_band", "floor_area_band"], +# right_on=['Starting Band', 'Finishing Band', 'Floor Area Segment', ] +# ) +# asset_list = asset_list.drop(columns=['Starting Band', 'Finishing Band', 'Floor Area Segment']) +# +# asset_list = 
asset_list.rename( +# columns={"Cost Savings": "funding_abs"} +# ) +# +# print(asset_list["domna_property_id"].duplicated().sum()) +# +# # Store this data +# asset_list.to_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/livewest_sw_solar_abs_estimates-solar.csv", +# index=False +# ) + +# Cavity process! +cwi_sap_points = [] +for r in asset_list_epc_client.non_invasive_recommendations: + if not r.get("recommendations"): + continue + cwi_recommendations = [ + x for x in r["recommendations"] if "cavity_wall_insulation" in x["type"] + ] + if cwi_recommendations: + cwi_recommendations = cwi_recommendations[0] + else: + continue + + address = r["address"] + postcode = r["postcode"] + + cwi_sap_points.append( + { + "address": address, + "postcode": postcode, + "type": cwi_recommendations["type"], + "sap_points": cwi_recommendations["sap_points"] + } + ) + +cwi_sap_points = pd.DataFrame(cwi_sap_points) +cwi_sap_points = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Abri/Abs Rates/cwi_sap_points.csv", +) +cwi_sap_points.drop_duplicates(subset=["address", "postcode"], inplace=True) avg_cwi_points_by_postcode = cwi_sap_points.groupby(["postcode"]).agg({"sap_points": "mean"}).reset_index() avg_cwi_points = cwi_sap_points["sap_points"].median() asset_list = asset_list.merge( @@ -186,13 +199,22 @@ asset_list["funding_scheme"] = np.where( "GBIS", "ECO4" ) +# Note - anything that is EPC E or below that doesn't go up to a C will be GBIS +# To detect this, if the starting SAP score is 54 or below and the ending SAP score is 68 or below +# we will assume it is GBIS +asset_list["funding_scheme"] = np.where( + (asset_list["post_works_sap"] < 69) & (asset_list["epc_sap_score_on_register"] < 55), + "GBIS", + asset_list["funding_scheme"] +) + asset_list = asset_list.merge( abs_matrix, how="left", left_on=["starting_half_band", "ending_half_band", "floor_area_band"], right_on=['Starting Band', 'Finishing Band', 'Floor Area Segment', ] ) 
asset_list = asset_list.drop(columns=['Starting Band', 'Finishing Band', 'Floor Area Segment']) -# Using CWI solid 1.7 -> 0.3 rates +# Using CWI 0.033 as the partial project score cwi_pps_matrix = pps_matrix[ pps_matrix["Measure_Type"].isin(["CWI_0.033"]) ] @@ -220,10 +242,26 @@ asset_list["funding_abs"] = np.where( asset_list["Cost Savings"] ) -asset_list["domna_property_id"].duplicated().sum() +from recommendations.recommendation_utils import ( + estimate_external_wall_area, +) + +# For some reason, estimated insulation wall area is missing +asset_list["estimated_insulation_wall_area"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["attribute_est_number_floors"], + floor_height=( + float(x["epc_floor_height"]) if + not pd.isnull(x["epc_floor_height"]) else 2.5 + ), + perimeter=x["attribute_est_perimter"], + built_form=x["epc_archetype"] + ), + axis=1 +) # Store this data asset_list.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/livewest_sw_abs_estimates.csv", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Abri/Abs Rates/Abri CWI ABS Estimates.csv", index=False ) diff --git a/asset_list/app.py b/asset_list/app.py index efc9cf44..37d9ae0d 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -60,11 +60,11 @@ def app(): """ # TODO: Delete me - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild" - data_filename = "Bromford Asset List.xlsx" - sheet_name = "Asset List" - postcode_column = 'PostCode' - fulladdress_column = "FullAddress" + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/" + data_filename = "20250716 Asset List.xlsx" + sheet_name = "Sheet 1" + postcode_column = 'Postcode' + fulladdress_column = "Full Address" address1_column = None address1_method = "house_number_extraction" address_cols_to_concat = [] @@ -76,24 +76,93 @@ def app(): landlord_wall_construction = None landlord_heating_system = None 
landlord_existing_pv = None - landlord_property_id = "Asset" + landlord_property_id = "Row ID" outcomes_filename = [] outcomes_sheetname = [] outcomes_postcode = [] outcomes_houseno = [] outcomes_address = [] - outcomes_id = [None] - master_filepaths = [os.path.join("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/", - "Needs ID/SOLAR PV ONLY-Table 1.csv")] + outcomes_id = [] + master_filepaths = [] master_to_asset_list_filepath = None asset_list_header = 0 landlord_block_reference = None - master_id_colnames = [None] + master_id_colnames = [] landlord_roof_construction = None phase = False landlord_sap = None ecosurv_landlords = None + # Southend + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southend/July 2025 Programme" + data_filename = "SOUTHEND - RYAN.xlsx" + sheet_name = "July 2025 Surveys" + postcode_column = 'Postcode' + fulladdress_column = "Full postal address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Property age" + landlord_os_uprn = None + landlord_property_type = "Property type" + landlord_built_form = "Property type" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "ID" + outcomes_filename = [] + outcomes_sheetname = [] + outcomes_postcode = [] + outcomes_houseno = [] + outcomes_address = [] + outcomes_id = [] + master_filepaths = [] + master_to_asset_list_filepath = None + asset_list_header = 0 + landlord_block_reference = None + master_id_colnames = [] + landlord_roof_construction = None + phase = False + landlord_sap = None + ecosurv_landlords = None + + # For Rooftop + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Rooftop" + data_filename = "Rooftop Asset List - July 2025.xlsx" + sheet_name = "Sheet1" + postcode_column = 'post_code' + fulladdress_column = None + address1_column = "add_1" + 
address1_method = None + address_cols_to_concat = [ + "add_1", "add_2", "add_3", "add_4" + ] + missing_postcodes_method = None + landlord_year_built = "date_built" + landlord_os_uprn = None + landlord_property_type = "ConstructionStyle" + landlord_built_form = "ConstructionStyle" + landlord_wall_construction = None + landlord_heating_system = "Description" + landlord_existing_pv = None + landlord_property_id = "PropertyCode" + outcomes_filename = [os.path.join(data_folder, "Rooftop_Outcomes.xlsx")] + outcomes_sheetname = ["OUTCOMESs"] + outcomes_postcode = ["POSTCODE"] + outcomes_houseno = ["NO"] + outcomes_address = ["ADDRESS"] + outcomes_id = [None] + master_filepaths = [os.path.join(data_folder, "Master.csv")] + master_to_asset_list_filepath = None + asset_list_header = 1 + landlord_block_reference = "bl_rec_ref" + master_id_colnames = [None] + landlord_roof_construction = None + phase = False + landlord_sap = None + ecosurv_landlords = "rooftop" + # For Housing data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing/New Programme July 2025" data_filename = "FOR HOUSING Asset List (Combined).xlsx" diff --git a/asset_list/hubspot/prepare_for_hubspot.py b/asset_list/hubspot/prepare_for_hubspot.py index ba2a2d23..56ce37ed 100644 --- a/asset_list/hubspot/prepare_for_hubspot.py +++ b/asset_list/hubspot/prepare_for_hubspot.py @@ -45,13 +45,13 @@ def app(): # inputs: reconcile_programme = True # If True, the hubspot upload will include all properties with a project code - customer_domain = "https://calico.org.uk" - installer_name = "WARM FRONT" + customer_domain = "https://southend.gov.uk" + installer_name = "J & J CRUMP" asset_list_filepath = ( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Calico/Hubspot/07.04 CALICO - Final List - " - "Standardised.xlsx" + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southend/July 2025 Programme/SOUTHEND - RYAN - " + "Standardised 2.xlsx" ) - asset_list_sheet_name = "Final Route March" 
+ asset_list_sheet_name = "Standardised Asset List" asset_list_header = 0 contact_details_filepath = None @@ -107,7 +107,7 @@ def app(): raise ValueError("FIX MEEE") if pd.isnull(asset_list.hubspot_data['Deal Stage ']).any(): - raise ValueError("Warning: Some rows have missing project codes. These will not be uploaded to HubSpot.") + raise ValueError("Warning: Some rows have missing deal stage. These will not be uploaded to HubSpot.") # Just store locally asset_list.hubspot_data.to_csv(output_filepath, index=False, encoding="utf-8-sig") diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 4ebe016f..c9cd061f 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -384,6 +384,7 @@ BUILT_FORM_MAPPINGS = { 'Cottage Flat': 'ground floor', 'Maisonette Over Shop': 'mid-floor', 'Medium Rise Flat': 'mid-floor', - 'Maisonette Medium Rise': 'unknown' + 'Maisonette Medium Rise': 'unknown', + 'End-terraced house': 'end-terrace' } diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index ef73f133..2a388b2f 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -102,3 +102,9 @@ class PlanTriggerRequest(BaseModel): # If true, before optimising the engine will select a slightly larger package, to account for the SAP 10 causing # scores to drop by a few points simulate_sap_10: Optional[bool] = False + + # Optional fields describing the format of the asset list being used (each defaults to None when not supplied) + + file_type: Optional[Literal["csv", "xlsx"]] = None + file_format: Optional[Literal["domna_asset_list"]] = None + sheet_name: Optional[str] = None diff --git a/etl/customers/abri/abs_rates.py b/etl/customers/abri/abs_rates.py new file mode 100644 index 00000000..f9f2f98e --- /dev/null +++ b/etl/customers/abri/abs_rates.py @@ -0,0 +1,12 @@ +import pandas as pd + +project = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Abri/Abs Rates/Desktop ABRI data - Standardised 
After " + "Programmes.xlsx" +) + +cavity = project[project["cavity_reason"].isin( + ["EPC Shows Empty Cavity: SAP Rating 54 or less", "EPC Shows Empty Cavity: SAP Rating 55-68"] +)] + +# Pull the data diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py deleted file mode 100644 index 95fe4fcd..00000000 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ /dev/null @@ -1,4293 +0,0 @@ -import os -from urllib import parse -from fuzzywuzzy import fuzz - -import PyPDF2 -import re -import pandas as pd -import numpy as np -from tqdm import tqdm -from collections import Counter -from scipy.optimize import linprog - -from SearchEpc import SearchEpc -from utils.s3 import read_pickle_from_s3 - -CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" -SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") -NUM_FOLDERS = 15 - - -def sap_to_epc(sap_points: int | float): - """ - Simple utility function to convert SAP points to EPC rating. - :param sap_points: numerical value of SAP points, typically between 0 and 100 - :return: - """ - - if sap_points <= 0: - raise ValueError("SAP points should be above 0.") - - if sap_points >= 92: - return "A" - elif sap_points >= 81: - return "B" - elif sap_points >= 69: - return "C" - elif sap_points >= 55: - return "D" - elif sap_points >= 39: - return "E" - elif sap_points >= 21: - return "F" - else: - return "G" - - -def extract_wall_details_summary(text): - """ - Extracts wall type, insulation, dry-lining, and thickness for each building part, - including any alternative wall details within the 7.0 Walls section of the summary PDF text. 
- """ - # Define data structure to hold all building part wall entries - wall_data = [] - - # Locate the entire 7.0 Walls section - wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) - - # Define pattern to match each building part's wall entry within the section - building_part_pattern = re.compile( - r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label - r"Type\s+(.*?)\n" # Matches main wall Type - r"Insulation\s+(.*?)\n" # Matches main wall Insulation - r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining - r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown - r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness - re.DOTALL - ) - - # Define pattern to capture alternative wall details, if present - alternative_wall_pattern = re.compile( - r"Alternative Wall Area.*?\n" # Matches start of alternative wall section - r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type - r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation - r"(Alternative Dry-lining\s+(.*?)\n)?" 
# Optional Alternative Dry-lining - r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown - r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness - re.DOTALL - ) - - # Find all building part entries within the 7.0 Walls section - for match in building_part_pattern.finditer(wall_section): - wall_label = match.group(1).strip() - main_wall_type = match.group(2).strip() - main_wall_insulation = match.group(3).strip() - main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" - main_wall_thickness_unknown = match.group(6).strip() - main_wall_thickness = int(match.group(7)) - - # Initialize dictionary for this wall entry - wall_entry = { - "Building Part": wall_label, - "Wall Type": main_wall_type, - "Wall Insulation": main_wall_insulation, - "Wall Dry-lining": main_wall_dry_lining, - "Wall Thickness Unknown": main_wall_thickness_unknown, - "Wall Thickness (mm)": main_wall_thickness, - "Alternative Wall Type": None, - "Alternative Wall Insulation": None, - "Alternative Wall Dry-lining": "N/A", - "Alternative Wall Thickness Unknown": None, - "Alternative Wall Thickness (mm)": None, - } - - # Check if there's an alternative wall section following this wall entry - alt_match = alternative_wall_pattern.search(wall_section, match.end()) - if alt_match: - wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() - wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip() - wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" - wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() - wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) - - # Append each building part as a dictionary in the wall_data list - wall_data.append(wall_entry) - - return wall_data - - -def extract_summary_report(pdf_path): - """ - Extracts specific data from the provided PDF file. 
- Data includes: - - Current SAP rating - - Fuel Bill - - Address - """ - - data = { - "Address": None, - "Postcode": None, - "Current SAP Rating": None, - "Current EPC Band": None, - "Fuel Bill": None, - "Main Building Age Band": None, - "Number of Storeys": None, - "Window Age Description": None, - "Window Age Description Proportion (%)": None, - "Secondary Window Age Description": None, - "Secondary Window Age Description Proportion (%)": None, - "Number of Windows": None, - "Total Number of Doors": None, - "Number of Insulated Doors": None, - "Existing Primary Heating System": None, - "Existing Primary Heating PCDF Reference": None, - "Existing Primary Heating Controls": None, - "Existing Primary Heating % of Heat": None, - "Existing Secondary Heating System": None, - "Existing Secondary Heating PCDF Reference": None, - "Existing Secondary Heating Controls": None, - "Existing Secondary Heating % of Heat": None, - "Secondary Heating Code": None, - "Water Heating Code": None, - 'Total Floor Area (m2)': None, - 'Total Ground Floor Area (m2)': None, - 'RIR Floor Area': None, - 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None, - "Number of Light Fittings": None, - "Number of LEL Fittings": None, - "Number of fittings needing LEL": None, - "Main Roof Type": None, - "Main Roof Insulation": None, - "Main Roof Insulation Thickness": None, - "Main Wall Type": None, - "Main Wall Insulation": None, - "Main Wall Dry-lining": None, - "Main Wall Thickness": None, - "Main Building Alternative Wall Type": None, - "Main Building Alternative Wall Insulation": None, - "Main Building Alternative Wall Dry-lining": None, - "Main Building Alternative Wall Thickness": None, - } - - with (open(pdf_path, "rb") as file): - reader = PyPDF2.PdfReader(file) - text = "" - for page in reader.pages: - text += page.extract_text() - - # Extract Current SAP rating - sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - data["Current SAP Rating"] = 
sap_match.group(1).split(" ")[1] - - # Extract age - age_band_match = re.search( - r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", - text - ) - data["Main Building Age Band"] = age_band_match.group(1) - - # Number of storeys - storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) - data["Number of Storeys"] = int(storeys_match.group(1)) - - # Extract Carbon Emissions - # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) - # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) - - # Extract Fuel Bill - fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) - data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - - # Extract individual address components - postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) - # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) - house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) - house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) - street = re.search(r"Street:\s*(.*?)\nLocality:", text) - locality = re.search(r"Locality:\s*(.*?)\nTown:", text) - town = re.search(r"Town:\s*(.*?)\nCounty:", text) - county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) - - # Clean extracted values and remove any prefixes - address_parts = [ - house_no.group(1).strip() if house_no else "", - house_name.group(1).strip() if house_name else "", - street.group(1).strip() if street else "", - locality.group(1).strip() if locality else "", - town.group(1).strip() if town else "", - county.group(1).strip() if county else "", - postcode.group(1).strip() if postcode else "" - ] - - # Join non-empty parts with a comma - data["Address"] = ", ".join([part for part in address_parts if part]) - data["Postcode"] = postcode.group(1).strip() - - windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - 
data.update(window_data) - - # Extract Total Number of Doors - total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) - data["Total Number of Doors"] = int(total_doors_match.group(1)) - - # Extract Number of Insulated Doors - insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) - data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) - - # Extract heating system - # Extract Primary Heating Data - # Extract Primary Heating Section - primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) - primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 - - primary_text = primary_heating_section.group(1) - - data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( - 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1) - data["Existing Primary Heating Controls"] = re.search( - r"Main Heating Controls\s*(.*?)\n", primary_text - ).group(1).strip() - data["Existing Primary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) - ) - - # Extract Secondary Heating Section - secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - - if secondary_heating_section is None: - data["Existing Secondary Heating System"] = "" - data["Existing Secondary Heating PCDF Reference"] = "" - data["Existing Secondary Heating Controls"] = "" - data["Existing Secondary Heating % of Heat"] = 0 - - else: - secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = 
main_heating_code_match_secondary.group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) - ) - - # Extract Secondary Heating and Water Heating Codes - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) - water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - - if data["Existing Secondary Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" - - data["Water Heating Code"] = water_heating_code_match.group(1).strip() - - dimensions = extract_building_parts_summary(text) - data.update(dimensions) - - data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) - data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) - data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - - extracted_roof_data = extract_roof_details_summary(text) - main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] - data["Main Roof Type"] = main_roof_data["Roof Type"] - data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] - data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] - - walls_data = extract_wall_details_summary(text) - # Get the main building wall data - main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] - data["Main Wall Type"] = main_building_walls["Wall Type"] - data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] - data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] - data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] - data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] - data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] - data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] - data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] - - return data - - -def extract_window_age_description(windows_text): - """ - Extracts the most common window age description and its proportion. - - Parameters: - windows_text (str): The text section containing window data. - - Returns: - dict: A dictionary with the most common window age description and its proportion. 
- """ - # Clean up windows_text by removing line breaks for better pattern matching - windows_text = windows_text.replace("\n", "") - - # Define possible window age descriptions - window_descriptions = [ - "Double post or during 2002", - "Double pre 2002", - "Double with unknown install date", - "Secondary glazing", - "Triple glazing", - "Single glazing", - ] - - # Count occurrences of each description - description_counts = Counter() - for description in window_descriptions: - matches = re.findall(re.escape(description), windows_text) - description_counts[description] = len(matches) - - if not description_counts or not sum(description_counts.values()): - raise ValueError("Failed to extract window data.") - - # Determine the most common description and calculate its proportion - most_common_description, window_count = description_counts.most_common(1)[0] - window_proportion = window_count / sum(description_counts.values()) * 100 - - # Get the second most common and the proportion - if window_proportion == 100: - second_most_common_description = None - second_most_common_proportion = 0 - else: - second_most_common_description, second_window_count = description_counts.most_common(2)[1] - second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 - - return { - "Window Age Description": most_common_description, - "Window Age Description Proportion (%)": window_proportion, - "Secondary Window Age Description": second_most_common_description, - "Secondary Window Age Description Proportion (%)": second_most_common_proportion, - "Number of Windows": sum(description_counts.values()) - } - - -def extract_building_parts_epr(text): - """ - Extracts building parts and associated dimensions from the provided PDF text. - Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length. - Handles cases where 'Room(s) in Roof area' appears within the part_name with only the Floor Area information. 
- """ - data = [] - - # Pattern to locate each "Building part" section - building_part_pattern = re.compile( - r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " - r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", - re.DOTALL - ) - - # Extract each building part - for match in building_part_pattern.finditer(text): - part_name = match.group(1).strip() - floor_data = match.group(2) - - # Check for "Room(s) in Roof area" within the part_name - room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) - if room_in_roof_match: - # Extract Room in Roof area and add it as a separate entry - floor_area = float(room_in_roof_match.group(1)) - # Clean up part name to exclude "Room(s) in Roof area" from the building part name - cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() - data.append({ - "Building Part": cleaned_part_name, - "Floor Level": "Room in Roof", - "Floor Area (m2)": floor_area, - "Room Height (m)": None, # Placeholder for missing data - "Perimeter (m)": None, # Placeholder for missing data - "Party Wall Length (m)": None # Placeholder for missing data - }) - else: - # Clean up part name to keep only the descriptor (e.g., "Main" or "1st Extension") - cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() - - # Pattern to match each floor's measurements in standard cases - floor_pattern = re.compile( - r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" - ) - - # Extract floor details for each building part - for floor_match in floor_pattern.finditer(floor_data): - floor_level = floor_match.group(1) - floor_area = float(floor_match.group(2)) - room_height = float(floor_match.group(3)) - perimeter = float(floor_match.group(4)) - party_wall_length = float(floor_match.group(5)) - - # Append to data - data.append({ - "Building Part": cleaned_part_name, - "Floor Level": floor_level, 
- "Floor Area (m2)": floor_area, - "Room Height (m)": room_height, - "Perimeter (m)": perimeter, - "Party Wall Length (m)": party_wall_length - }) - - # Aggregated data calculation - main_building = [part for part in data if "Main" in part["Building Part"]] - first_extension = [part for part in data if "1st Extension" in part["Building Part"]] - dimensions = { - "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area (m2)": sum( - [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] - ), - "RIR Floor Area": sum( - [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] - ), - "Main Building Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building if - x["Perimeter (m)"] and x["Room Height (m)"]] - ), - "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension if - x["Perimeter (m)"] and x["Room Height (m)"]] - ) if first_extension else 0, - } - - return dimensions - - -def extract_building_parts_summary(text): - """ - Extracts building parts and associated dimensions from the summary report PDF. - This includes Main Property, multiple extensions if they exist, and Room in Roof areas. 
- """ - data = [] - - # Locate the Dimensions section - dimensions_section = re.search( - r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL - ) - if not dimensions_section: - dimensions_section = re.search( - r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL - ) - if not dimensions_section: - raise ValueError("Failed to locate dimensions section in the text.") - - dimensions_text = dimensions_section.group(1) - - # Pattern to extract each building part, starting from Main Property and including extensions - building_part_pattern = re.compile( - r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" - r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", - re.DOTALL - ) - - # Loop through each building part match, including Main Property and extensions - for match in building_part_pattern.finditer(dimensions_text): - part_name = match.group(1) - floor_data = match.group(2) - - # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length - floor_pattern = re.compile( - r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" - ) - - # Extract data for each floor within the building part - for floor_match in floor_pattern.finditer(floor_data): - floor_level = floor_match.group(1) - floor_area = float(floor_match.group(2)) - room_height = float(floor_match.group(3)) - perimeter = float(floor_match.group(4)) - party_wall_length = float(floor_match.group(5)) - - # Append to data list - data.append({ - "Building Part": part_name, - "Floor Level": floor_level, - "Floor Area (m2)": floor_area, - "Room Height (m)": room_height, - "Perimeter (m)": perimeter, - "Party Wall Length (m)": party_wall_length - }) - - # Check specifically for "Room(s) in Roof" entries, which only have Floor Area - room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") - room_in_roof_match = room_in_roof_pattern.search(floor_data) - if 
room_in_roof_match: - floor_area = float(room_in_roof_match.group(1)) - data.append({ - "Building Part": part_name, - "Floor Level": "Room in Roof", - "Floor Area (m2)": floor_area, - "Room Height (m)": None, # Placeholder for missing data - "Perimeter (m)": None, # Placeholder for missing data - "Party Wall Length (m)": None # Placeholder for missing data - }) - - # Calculate aggregated dimensions - main_property = [part for part in data if "Main Property" in part["Building Part"]] - first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] - dimensions = { - "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area (m2)": sum( - [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] - ), - "RIR Floor Area": sum( - [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] - ), - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if - x["Perimeter (m)"] and x["Room Height (m)"]]), - "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if - x["Perimeter (m)"] and x["Room Height (m)"]] - ), - } - - return dimensions - - -def extract_roof_details_epr(text): - """ - Extracts roof type, insulation, and insulation thickness for each building part - in the provided EPR PDF text. 
- """ - # Define data structure to hold results - roof_data = [] - - # Locate each building part section - building_part_pattern = re.compile( - r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", - re.DOTALL - ) - - # Extract each building part's data, including roof details - for match in building_part_pattern.finditer(text): - part_name = match.group(1).strip() - - # Clean up the building part name - cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() - - part_details = match.group(2) - - # Extract Roof Type, Roof Insulation, and Roof Insulation Thickness - roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) - roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) - roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) - - # Store results for this building part - roof_data.append({ - "Building Part": cleaned_part_name, - "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, - "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, - "Roof Insulation Thickness": roof_insulation_thickness_match.group( - 1).strip() if roof_insulation_thickness_match else None, - }) - - return roof_data - - -def extract_roof_details_summary(text): - """ - Extracts roof type, insulation, and insulation thickness for each building part - in the 8.0 Roofs section of the summary report. 
- """ - # Define data structure to hold results - roof_data = [] - - # Locate the entire 8.0 Roofs section - roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) - if not roof_section_match: - return roof_data # Return empty if no roof section is found - - # Extract the roof section and append "9.0 Floors:" as the boundary - roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" - - # Define pattern to match each building part's roof entry - building_part_pattern = re.compile( - r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label - r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, or end - r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation - r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness - re.DOTALL - ) - - # Extract each building part's data - for match in building_part_pattern.finditer(roof_section): - part_name = match.group(1).strip() # Building part label - roof_type = match.group(2).strip() # Roof Type - roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation - roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness - - # Cleaning to handle annoying cases when it comes out like this: - # 'A Another dwelling above\n1st Extension' - if roof_type.startswith("A Another dwelling above"): - roof_type = "A Another dwelling above" - - # Store results for this building part - roof_data.append({ - "Building Part": part_name, - "Roof Type": roof_type, - "Roof Insulation": roof_insulation, - "Roof Insulation Thickness": roof_insulation_thickness, - }) - - return roof_data - - -def extract_wall_details_epr(text): - """ - Extracts wall type, insulation, dry-lining, and thickness for each building part - in the provided EPR PDF text. 
- """ - # Define data structure to hold results - wall_data = [] - - # Locate each building part section - building_part_pattern = re.compile( - r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", - re.DOTALL - ) - - # Extract each building part's data, including wall details - for match in building_part_pattern.finditer(text): - part_name = match.group(1).strip() - - # Clean up the building part name - cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() - - part_details = match.group(2) - - # Extract Wall Type, Wall Insulation, Wall Dry-lining, and Wall Thickness - wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details) - wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details) - wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) - wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details) - - # Extract Alternative Wall information if available - alt_wall_type_match = re.search(r"Alternative Wall Type:\s*(.*?)(?=\n|$)", part_details) - alt_wall_insulation_match = re.search(r"Alternative Wall Insulation:\s*(.*?)(?=\n|$)", part_details) - alt_wall_drylining_match = re.search(r"Alternative Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) - alt_wall_thickness_match = re.search(r"Alternative Wall Thickness:\s*(\d+)(?=\n|$)", part_details) - - # Store results for this building part - wall_data.append({ - "Building Part": cleaned_part_name, - "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None, - "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None, - "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None, - "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None, - "Alternative Wall Type": alt_wall_type_match.group(1).strip() if alt_wall_type_match else None, - 
"Alternative Wall Insulation": alt_wall_insulation_match.group( - 1).strip() if alt_wall_insulation_match else None, - "Alternative Wall Dry-lining": alt_wall_drylining_match.group( - 1).strip() if alt_wall_drylining_match else None, - "Alternative Wall Thickness": int(alt_wall_thickness_match.group(1)) if alt_wall_thickness_match else None, - }) - - return wall_data - - -def extract_epr(pdf_path): - """ - Extracts specific data from an Energy Report (EPR) PDF file. - """ - - data = { - "Address": None, - "Postcode": None, - "Current SAP Rating": None, - "Current EPC Band": None, - "Primary Energy Use (kWh/yr)": None, - "Primary Energy Use Intensity (kWh/m2/yr)": None, - "Number of Storeys": None, - "Main Building Age Band": None, - "Fuel Bill": None, - "Window Age Description": None, - "Window Age Description Proportion (%)": None, - "Secondary Window Age Description": None, - "Secondary Window Age Description Proportion (%)": None, - "Number of Windows": None, - "Total Number of Doors": None, - "Number of Insulated Doors": None, - "Existing Primary Heating System": None, - "Existing Primary Heating PCDF Reference": None, - "Existing Primary Heating Controls": None, - "Existing Primary Heating % of Heat": None, - "Existing Secondary Heating System": None, - "Existing Secondary Heating PCDF Reference": None, - "Existing Secondary Heating Controls": None, - "Existing Secondary Heating % of Heat": None, - "Secondary Heating Code": None, - "Water Heating Code": None, - 'Total Floor Area (m2)': None, - 'Total Ground Floor Area (m2)': None, - 'RIR Floor Area': None, - 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None, - "Number of Light Fittings": None, - "Number of LEL Fittings": None, - "Number of fittings needing LEL": None, - "Main Roof Type": None, - "Main Roof Insulation": None, - "Main Roof Insulation Thickness": None, - "Main Wall Type": None, - "Main Wall Insulation": None, - "Main Wall Dry-lining": None, - "Main Wall Thickness": 
None, - "Main Building Alternative Wall Type": None, - "Main Building Alternative Wall Insulation": None, - "Main Building Alternative Wall Dry-lining": None, - "Main Building Alternative Wall Thickness": None, - "Main Fuel": None - } - - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - text = "" - for page in reader.pages: - text += page.extract_text() - - # Extract Address - address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) - data["Address"] = address_match.group(1).strip() - data["Postcode"] = data["Address"].split(",")[-1].strip() - - # Extract Current and Potential SAP ratings - sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) - if sap_match is None: - # Handles the older format of the elmhurst EPR - # The text will look something like this: - # Least energy efficient - higher running costsD 61 - we extract D 61 - sap_match = re.search( - r"(?P[A-G])\s(?P\d{1,3})(?P[A-G])\s(?P\d{1,3})", - text) - data["Current EPC Band"] = sap_match.group("current_epc") - data["Current SAP Rating"] = int(sap_match.group("current_sap")) - else: - current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) - data["Current SAP Rating"] = current_sap - - # Extract the primary energy use intensity - additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) - if additional_rating_match: - data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) - else: - # Handles the older format of the Elmhurst EPR - primary_energy_match = re.search(r"actual consumption\.\n(?P\d+)", text) - data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy")) - # We calculate the primary energy use intensity by dividing by floor area - floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") - data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / 
int(floor_area) - - # Extract age band - age_band_match = re.search( - r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", - text - ) - - data["Main Building Age Band"] = age_band_match.group(1) - - # Extract Number of Storeys - storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) - data["Number of Storeys"] = int(storeys_match.group(1)) - - # Extract Fuel Bill - fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) - data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - - # Extract Total Number of Doors - total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) - data["Total Number of Doors"] = int(total_doors_match.group(1)) - - # Extract Number of Insulated Doors - insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) - data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) - - # Extract Primary Heating Section (Main Heating 1) - primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) - # We may not have a secondary heating - primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL) - primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 - primary_text = primary_heating_section.group(1) - - data["Existing Primary Heating System"] = re.search( - r"Main Heating Code\s*(.*?)\n", primary_text - ).group(1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1) - data["Existing Primary Heating Controls"] = re.search( - r"Main Heating Controls\s*(.*?)\n", primary_text - ).group(1).strip() - data["Existing Primary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) - ) - - # Extract Secondary Heating Section (Main Heating 2) - secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary 
Heating", text, re.DOTALL) - if secondary_heating_section is None: - data["Existing Secondary Heating System"] = "" - data["Existing Secondary Heating PCDF Reference"] = "" - data["Existing Secondary Heating Controls"] = "" - data["Existing Secondary Heating % of Heat"] = 0 - - else: - secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() - - data["Existing Secondary Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", secondary_text - ).group(1) - - if data["Existing Secondary Heating System"] == "": - data["Existing Secondary Heating Controls"] = "" - else: - # Might not have heating controls on 2nd system - secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - secondary_controls_match.group(1).strip() if secondary_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) - - # Extract Secondary Heating and Water Heating Codes - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) - water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - - if data["Existing Secondary Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" - data["Water Heating Code"] = water_heating_code_match.group(1).strip() - - # Extract Windows information - windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - if windows_section: - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - 
data.update(window_data) - - building_parts = extract_building_parts_epr(text) - data.update(building_parts) - - # Get number of lighting outlets and number of fittings needing LEL - lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) - data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) - lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) - data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) - data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - - roof_details = extract_roof_details_epr(text) - # Get from the main building - main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]] - data["Main Roof Type"] = main_roof_details[0]["Roof Type"] - data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"] - data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"] - - wall_details = extract_wall_details_epr(text) - main_wall_details = [w for w in wall_details if "Main" in w["Building Part"]][0] - data["Main Wall Type"] = main_wall_details["Wall Type"] - data["Main Wall Insulation"] = main_wall_details["Wall Insulation"] - data["Main Wall Dry-lining"] = main_wall_details["Wall Dry-lining"] - data["Main Wall Thickness"] = main_wall_details["Wall Thickness"] - data["Main Building Alternative Wall Type"] = main_wall_details["Alternative Wall Type"] - data["Main Building Alternative Wall Insulation"] = main_wall_details["Alternative Wall Insulation"] - data["Main Building Alternative Wall Dry-lining"] = main_wall_details["Alternative Wall Dry-lining"] - data["Main Building Alternative Wall Thickness"] = main_wall_details["Alternative Wall Thickness"] - - return data - - -def detect_report_type(pdf_path, pdf_file): - """ - Detects the type of report based on content or filename. 
- :param pdf_path: String path to the PDF file - :param pdf_file: String name of the PDF file - :return: String type of the report ("epr", "summary", or None) - """ - # Attempt to read the first page of the PDF to determine type - with open(pdf_path, "rb") as file: - # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter - # This is because the pdf is irregular. We could possibly try a library like fitz to handle this - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - n_pages = len(reader.pages) - - if is_energy_report(first_page_text) and n_pages > 3: - # The EPR should have more than 3 pages - return "epr" - elif is_energy_report(first_page_text) and n_pages <= 3: - # This is a shortened version of the EPR which isn't massively useful - return "short_form_epr" - elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): - return "summary" - elif is_condition_report(first_page_text): - return "condition" - - return None - - -def extract_retrofit_pdfs(data_folder_path): - """ - Handles extraction from a retrofit data folder if it exists and has content. - Prioritizes extracting data from an EPR if both EPR and summary report are present. 
- """ - retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")] - report_types = {"epr": None, "summary": None} - - # First, identify the types of reports available - for pdf_file in retrofit_files: - pdf_path = os.path.join(data_folder_path, pdf_file) - report_type = detect_report_type(pdf_path, pdf_file) - - if report_type == "epr": - report_types["epr"] = pdf_path - elif report_type == "summary": - report_types["summary"] = pdf_path - - # Stop checking further if both EPR and summary are found - if report_types["epr"] and report_types["summary"]: - break - - # Extract data based on report availability and priority - if report_types["epr"]: - return extract_epr(report_types["epr"]) - elif report_types["summary"]: - return extract_summary_report(report_types["summary"]) - - # If no relevant PDF is found, return None - return None - - -def is_energy_report(text): - """ - Determines if the provided text indicates that the PDF is an Energy Report. - Returns True if the text contains 'Energy Report'. - """ - return text.startswith("ENERGY REPORT") - - -def is_summary_report(text): - """ - Determines if the provided text indicates that the PDF is a Summary Report. - """ - return text.startswith("Summary Information") - - -def detect_and_parse_report(pdf_path, pdf_file): - """ - Detects the type of report and extracts the relevant data. 
- :param pdf_path: String path to the PDF file - :param pdf_file: String name of the PDF file - :return: - """ - # Attempt to read the first page of the PDF to determine type - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - - if is_energy_report(first_page_text): - # Treat this as an Energy Report - return extract_epr(pdf_path) - elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): - # Treat this as a Summary Report - return extract_summary_report(pdf_path) - elif is_condition_report(first_page_text): - return None - else: - raise NotImplementedError("Implement me") - - -def is_condition_report(text): - """ - Determines if the provided text indicates that the PDF is a Condition Report. - """ - return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport") - - -def main(): - """ - This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. 
- """ - # List only directories in the specified FILE_PATH - survey_folders = [] - - # Loop over each survey folder and list its contents - for i in range(1, NUM_FOLDERS + 1): - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") - if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] - survey_folders.extend(folder_contents) # Append contents to the master list - - # Get rid of .DS_Store files - survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")] - - extracted_data = [] - for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) - - # List the folders inside of the survey folder - survey_subfolders = [name for name in os.listdir(survey_folder_path) - if os.path.isdir(os.path.join(survey_folder_path, name))] - - # Check if there's a "retrofit assessment" folder - retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) - - ra_folder = next( - (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), - None - ) - - # If retrofit assessment folder exists, check if it has content - if retrofit_folder or ra_folder: - if retrofit_folder: - retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) - else: - retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) - - # Check if everything inside is a sub-folder and the number of folders is 2 - items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] - all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] - if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: - # Get the folder that isn't Property Pics - retrofit_folder_path = os.path.join( - retrofit_folder_path, [item for item in items if item 
!= "Property Pics"][0] - ) - - if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) - if summary_data: - summary_data = { - "survey_folder": survey_folder, - **summary_data, - } - extracted_data.append(summary_data) - continue - else: - # Then we have an empty Retrofit Assessment folder - continue - - # If no retrofit folder or it was empty, check files in survey_folder - - summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) - if not summary_data: - if len(survey_subfolders) == 1: - survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) - summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) - - if summary_data: - summary_data = { - "survey_folder": survey_folder, - **summary_data, - } - extracted_data.append(summary_data) - - extracted_data = pd.DataFrame(extracted_data) - - extracted_data["Primary Energy Use (kWh/yr)"] = ( - extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] - ) - extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) - extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) - - # Remove some definite duplicates - dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"] - dupes = extracted_data[extracted_data["Address"].isin(dupes)] - dupes = dupes.sort_values("Address") - # Get all of the folders that end with ROSS - to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() - - extracted_data = extracted_data[ - ~extracted_data["survey_folder"].isin( - [ - "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", - "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", - "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" - ] + to_drop - ) - ] - - # We now merge on the coordinator data so that against each property, we can map 
the measures - # TODO: Get the pre & post primary energy numbers - # TODO: Make sure the numbers are going down - - retrofit_packages_board = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, - "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" - ), - header=4 - ) - retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] - # Take just the rows that have been surveyed - retrofit_packages_board = retrofit_packages_board[ - retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) - ] - # populated_primary_energy = retrofit_packages_board[ - # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)']) - # ] - # - # z = populated_primary_energy[ - # populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[ - # 'BASE Primary energy (13a-272)'] - # ] - # - # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[ - # 'BASE Primary energy (13a-272)']) - - # Replace \n with "" - extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") - - manual_filters = { - "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", - "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", - "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", - 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", - '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', - '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', - 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', - 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', - '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', - '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', - '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', - '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', - # '2 Sorrell Place': '', - # '72 St Ives Road': '', - # '1 The Close, Burton Gardens': '', - # '102 Cheaton Close': '', - # 'Flat 16 Spring Gardens': '', - # '4 Apple Close': '', - # '25 Folly Lane': '', - '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', - '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', - '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', - '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', - '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', - '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", - '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', - "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', - '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', - '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', - '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', - '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' - } - - # We now match this retrofit packages board to the extracted data - matching_lookup = [] - for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - - # Handle the case that has the wrong postcode in the asset data - if home["Name"] in manual_filters: - filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() 
- else: - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() - - # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False - ) - if to_filter.sum() == 0: - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", - "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False - ) - filtered = filtered[to_filter] - - if filtered.empty: - continue - - if filtered.shape[0] == 1: - matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Address ID": home["Address ID"], - "Name": home["Name"] - } - ) - continue - - # home["Name"] should be contained in the survey_folder - filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] - # We have an edge case wher some properties have two outputs in Sharepoint - if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": - raise Exception("Fix me1") - # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] - - if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - raise Exception("Fix me2") - # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] - - if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': - filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] - - if filtered.empty: - continue - if filtered.shape[0] != 1: - raise Exception("something went wrong") - - matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Address ID": home["Address ID"], - "Name": home["Name"] - } - ) - - matching_lookup = pd.DataFrame(matching_lookup) - # Find Osmosis IDs that are in 
the packages board but not in the matching looking - missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"]) - missing_ids = list(missing_ids) - if missing_ids: - # We check that the missing ids have no data yet - # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] - # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( - # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - - if len(missing_ids) != 1: - raise Exception("Unacceptable number of missings") - - if matching_lookup["Address ID"].duplicated().sum(): - raise Exception("Duplicate Address IDs") - - if matching_lookup["survey_folder"].duplicated().sum(): - raise Exception("Duplicate survey folders") - - measure_columns = [ - 'Main Wall Insulation', - 'Secondary Wall Insulation', - 'Loft insulation', - 'Flat Roof', - 'Room in Roof', - 'Window Upgrade', - 'Door Upgrade', - 'Ventilation', - 'Main Heating', - 'Water Heating', - 'Heating Controls', - 'Solar PV', - 'Other measures' - ] - - # We should end up with a 1:1 mapping between the Osm. ID and the survey folder - stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge( - retrofit_packages_board[ - [ - "Name", - "RA", - "Address ID", - "Archetype ID", - "Arch. 
Group Rank", - "Actual SAP Band", - "Actual SAP Rating", - "Modelled SAP Band", - "Modelled SAP Rating", - "Package Ref", - ] + measure_columns - ], - on=["Address ID", "Name"], - how="left" - ) - - if stonewater_data["Address ID"].duplicated().sum(): - raise Exception("Duplicate Address IDs") - # Create a section for costs - for measure in measure_columns: - stonewater_data[f"Cost of {measure}"] = None - - stonewater_data["Total Cost of Measures"] = None - stonewater_data["Contingency Cost"] = None - stonewater_data["Total Cost of Measures inc Contingency"] = None - - # We've appended the recommended packages and modelled SAP ratings to the data - # We also want to append the windows data - windows_data = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, - "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx" - ), - header=12 - ) - - windows_data = windows_data[windows_data["Address ID"] != "Address ID"] - windows_data = windows_data[~pd.isnull(windows_data["Address ID"])] - - # We get a lookup id of Osm.ID and when the windows were fitted - windows_data = windows_data[ - ["Address ID", "Window attributes - Fitted/renewed date", - "Parent Asset Window attributes - Fitted/renewed date"] - ] - # Convert to string for the moment - windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ - "Parent Asset Window attributes - Fitted/renewed date" - ].astype(str) - # Create a single date column - windows_data["Fitted/renewed date"] = np.where( - pd.notnull(windows_data["Window attributes - Fitted/renewed date"]), - windows_data["Window attributes - Fitted/renewed date"], - windows_data["Parent Asset Window attributes - Fitted/renewed date"] - ) - # Convert to a date - windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"]) - # Calculate the number of years since something was done on the windows - windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - 
windows_data[ - "Fitted/renewed date"]).dt.days / 365 - - stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) - windows_data["Address ID"] = windows_data["Address ID"].astype(float) - stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") - stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True) - - if stonewater_data["Address ID"].duplicated().sum(): - raise Exception("Duplicate Address IDs") - - for c in [ - 'Window attributes - Fitted/renewed date', - 'Parent Asset Window attributes - Fitted/renewed date', - 'Fitted/renewed date' - ]: - stonewater_data[c] = stonewater_data[c].astype(str) - - # FIll the primary energy numbers from the excel - stonewater_data = stonewater_data.merge( - retrofit_packages_board[ - [ - "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)" - ] - ], - on=["Address ID", "Name"], - how="left" - ) - stonewater_data["Primary Energy Use (kWh/yr)"] = np.where( - pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]), - stonewater_data["BASE Primary energy (13a-272)"], - stonewater_data["Primary Energy Use (kWh/yr)"] - ) - stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"]) - - # Add on organisation reference - original_archetypes = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " - "- Archetyped V3.1.xlsx", - header=4 - ) - original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] - original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] - original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - - stonewater_data = stonewater_data.merge( - original_archetypes[["Address ID", 'Org. 
ref.']], - on="Address ID", - how="left" - ) - - # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False) - - cost_sheet = [ - { - "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2" - }, - { - "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2" - }, - { - "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2" - }, - { - "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2" - }, - { - "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2" - }, - { - "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2" - }, - { - "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2" - }, - { - "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each" - }, - { - "measure": "Secondary 2.40", "cost": 974, "unit": "each" - }, - { - "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each" - }, - { - "measure": "Ins. Door 1.40 w.m2.K", "cost": None, "unit": "each" - }, - { - "measure": "DMEV", "cost": 900, "unit": "each" - }, - { - "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each" - }, - { - "measure": "HHRSH Quantum 150", "cost": None, "unit": "each" - }, - { - "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each" - }, - { - "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each" - }, - { - "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each" - }, - { - "measure": "Smart Thermostat", "cost": 1200, "unit": "each" - }, - { - "measure": "TRV's", "cost": 350, "unit": "each" - }, - { - "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each" - }, - { - "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each" - }, - { - "measure": "LEL", "cost": 35, "unit": "per bulb" - }, - { - "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2" - }, - { - "measure": "Roof 0.16 - Walls 0.16", "cost": 180, 
"unit": "floor area m2" - }, - ] - cost_sheet = pd.DataFrame(cost_sheet) - - # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater - cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) - - # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values - - create_proposed_wave_3_bid( - costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx" - ), - archetypes_sheet_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" - ) - ) - - -def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): - # We read in the costed packages - costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages") - costed_packages = costed_packages[~pd.isnull(costed_packages["Address"])] - - archetypes_to_cost = costed_packages[ - [ - "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band", - "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost', - 'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation', - 'Main Roof Insulation Thickness', 'Existing Primary Heating System', - 'Existing Primary Heating PCDF Reference' - ] - ].copy() - - # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons! - archetypes_to_cost['Surveyed Main Roof'] = ( - archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' + - archetypes_to_cost['Main Roof Insulation Thickness'].astype(str) - ) - - # Combine the heating systems, separating by colons! 
- archetypes_to_cost['Surveyed Main Heating'] = ( - archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[ - 'Existing Primary Heating PCDF Reference'].astype(str) - ) - - archetypes_to_cost = archetypes_to_cost.drop( - columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', - 'Existing Primary Heating System', - 'Existing Primary Heating PCDF Reference']) - - # We take properties that are EPC D and below (59% of units) - archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] - - archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) - - # These are the Arhetypes that will likely be suitable for Wave 3 - archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) - archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] - archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"] - archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int) - - # We merge the property details onto the costed archetypes - archetypes_to_cost = archetypes_to_cost.merge( - archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], - on="Address ID", - how="left" - ) - - proposed_sample = archetypes_sheet[ - archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) - ] - - not_proposed = archetypes_sheet[ - ~archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) - ] - - # archetypes_without_survey = [] - # for p in list(set(not_proposed)): - # filtered = costed_packages[costed_packages["Archetype ID"].astype(int).astype(str) == p] - # if filtered.empty: - # archetypes_without_survey.append(p) - - # Can we propose anything about archetypes that were not surveyed? 
- - proposed_sample = proposed_sample[ - [ - "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. ID", "Archetype ID", - "Property Type", "Wall Type", "Roof Type", "Heating" - ] - ] - - # We classify into high and low confidence - - archetypes_to_cost["Surveyed Main Roof"] = archetypes_to_cost["Surveyed Main Roof"].fillna("") - - match_classification = [] - for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): - - surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy() - surveyed["Package Ref"] = surveyed["Package Ref"].astype(str) - - package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) - package = package.replace("\n", "") - - surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) - surveyed_roofs = surveyed_roofs.replace("\n", "") - - surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) - surveyed_heating = surveyed_heating.replace("\n", "") - - # We now check if we have a perfect match - surveyed = surveyed[ - (surveyed["Property Type"] == home["Property Type"]) & - (surveyed["Wall Type"] == home["Wall Type"]) & - (surveyed["Roof Type"] == home["Roof Type"]) & - (surveyed["Heating"] == home["Heating"]) - ] - - if surveyed.empty: - if package == "2B2A": - raise Exception("Fix me") - match_classification.append( - { - "Address ID": home["Address ID"], - "Match to Surveyed": "Approximate", - "Proposed Package Ref": package, - "Surveyed Archetype Roofs": surveyed_roofs, - "Surveyed Archetype Heating": surveyed_heating - } - ) - continue - # Re-do - package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) - package = package.replace("\n", "") - surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) - surveyed_roofs = surveyed_roofs.replace("\n", "") - surveyed_heating = " or 
".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) - surveyed_heating = surveyed_heating.replace("\n", "") - - match_classification.append( - { - "Address ID": home["Address ID"], - "Match to Surveyed": "Exact", - "Proposed Package Ref": package, - "Surveyed Archetype Roofs": surveyed_roofs, - "Surveyed Archetype Heating": surveyed_heating - } - ) - - match_classification = pd.DataFrame(match_classification) - - proposed_sample = proposed_sample.merge( - match_classification, - on="Address ID", - how="left", - ) - - # Merge on the cost per archetype - cost_per_archetype = ( - archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index() - ) - proposed_sample = proposed_sample.merge( - cost_per_archetype, - on="Archetype ID", - how="left" - ) - - # We add on a boolean to indicate if a property from that archetype has been modelled - proposed_sample = proposed_sample.merge( - archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(), - on="Archetype ID", - how="left" - ) - - proposed_sample["Total Cost of Measures inc Contingency"] = np.where( - ~proposed_sample["Has been modelled"], - None, proposed_sample["Total Cost of Measures inc Contingency"] - ) - - proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True) - - # Save excel - proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx", index=False) - - # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out - proposed_sample_postcodes = proposed_sample["Postcode"].unique() - - postcode_summary = [] - for postcode in proposed_sample_postcodes: - in_proposal = proposed_sample[proposed_sample["Postcode"] == postcode] - not_in_proposal = not_proposed[not_proposed["Postcode"] == postcode] - postcode_summary.append( - { - "Postcode": postcode, - "Number of properties in Proposal": len(in_proposal), - 
"Number of properties not in Proposal": len(not_in_proposal) - } - ) - postcode_summary = pd.DataFrame(postcode_summary) - postcode_summary = postcode_summary.sort_values( - "Number of properties not in Proposal", - ascending=False).reset_index(drop=True) - - postcode_summary.to_excel( - CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx", index=False - ) - - -def find_remaining_surveys(): - """ - This compares a list of properties that have been surveyed against a list of properties that I have produced - costed retrofit packages for, so I know what needs to be downloaded from Sharepoint - :return: - """ - - surveyed = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" - "/Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx", - header=4 - ) - - costed = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages " - "20241030 (WIP) MR Review v1.xlsx", - header=13, - sheet_name="Modelled Packages" - ) - costed = costed[~pd.isnull(costed["Address ID"])] - - needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])] - - needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. 
Group Rank"].astype(str) - needed = needed.sort_values("id", ascending=True) - needed[["id", "Name", "Postcode"]].to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/needed_surveys.csv" - ) - - assert needed.shape[0] + costed.shape[0] == surveyed.shape[0] - - -def append_stonewater_id(): - """ - This completes an adhoc request from Stonewater to add in their organisation Reference onto the model - :return: - """ - - model_proposed_sample = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Bid Packages WIP 13.11.24.xlsx", - sheet_name="Modelled Packages", - header=13 - ) - model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] - model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) - - original_archetypes = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " - "- Archetyped V3.1.xlsx", - header=4 - ) - original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] - original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] - original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - - matched = model_proposed_sample.merge( - original_archetypes[["Address ID", 'Org. ref.']], - on="Address ID", - how="left" - ) - - if pd.isnull(matched["Org. ref."]).sum(): - raise ValueError("Something went wrong") - - # Save as CSV - matched.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater IDs.xlsx", - sheet_name="Proposed Wave 3 Sample", - index=False - ) - - -def propsed_wave_3_sample(): - """ - Stonewater want to ensure that the properties that when selecting properties for wave 3, they choose properties - such that most of the properties within a geographical area are treatable within the bid. 
- Name, if we take a geographical area (which could be postal region) they want the most, and ideally all, of the - properties within that geographical area to be included within the bid - :return: - """ - - asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " - "- Archetyped V3.1.xlsx", - header=4 - ) - - # TODO: We drop 7 properties missing - # UPRN - asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] - # Clean address ids - asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] - asset_list = asset_list[asset_list["Address ID"] != "Address ID"] - asset_list["Address ID"] = asset_list["Address ID"].astype(int) - - asset_list["Street name"] = np.where( - pd.isnull(asset_list["Street name"]), - asset_list["Postcode"], - asset_list["Street name"] - ) - - # Create the postal region, taking the first part of the postcode - asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] - asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] - unique_postal_regions = asset_list["Postal Region"].unique() - - # Keep just the columns we need - asset_list = asset_list[ - ["UPRN", "Address ID", 'Org. 
ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region", - "Property Type", "Wall Type", "Roof Type", "Heating"] - ] - - survey_results = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), - header=13, - sheet_name="Modelled Packages" - ) - - survey_results = survey_results[ - [ - "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", - "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", - "Existing Primary Heating System", - "Package Ref", - "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", - "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness" - ] - ].rename( - columns={ - "Existing Primary Heating System": "Survey: Primary Heating System" - } - ) - - survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] - # Concatenate from the wall information - survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ - "Main Wall Insulation Type"].astype(str) - # Alternative wall - survey_results["Survey: Main Alternative Wall"] = ( - survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[ - "Main Building Alternative Wall Insulation"].astype(str) - ) - # Roof information - survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[ - "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) - - # Drop the individual columns: - survey_results = survey_results.drop( - columns=[ - "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", - "Main Wall Type", "Main Wall Insulation Type", - "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation" - ] - ) - - survey_results_with_original_features = 
survey_results.merge( - asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], - on="Address ID", - how="left" - ) - - if survey_results_with_original_features.shape[0] != survey_results.shape[0]: - raise ValueError("Something went wrong") - - # Against properties that have NO package ref, we assign a package ref - properties_with_packages = survey_results_with_original_features[ - ~pd.isnull(survey_results_with_original_features["Package Ref"]) - ] - - properties_without_packages = survey_results_with_original_features[ - (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull( - survey_results_with_original_features["Package Ref"] - ) - ] - - # Change this to a lookup - package_ratings = pd.DataFrame([ - { - "1A": 1, - "1B": 2, - "2A": 3, - "2B": 4, - "3A": 5, - "3B": 6, - 4: 7 - } - ]) - package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank") - - mapped_package_refs = [] - for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)): - # Same archetype? 
- matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]] - - if matches.empty: - # Similar property - matches = properties_with_packages[ - (properties_with_packages["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0]) & - (properties_with_packages["Wall Type"] == property["Wall Type"]) & - (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & - (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) - ] - if matches.empty: - matches = properties_with_packages[ - (properties_with_packages["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0]) & - (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) & - (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & - (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) - ] - if matches.empty: - raise Exception("Implement me") - if matches.shape[0] > 1: - # Take the package with the highest rank - matches = matches.merge( - package_ratings, - on="Package Ref", - how="left" - ).sort_values("Rank", ascending=False).head(1) - - mapped_package_refs.append( - { - "Address ID": property["Address ID"], - "Matched Package Ref": matches["Package Ref"].values[0] - } - ) - - mapped_package_refs = pd.DataFrame(mapped_package_refs) - - survey_results = survey_results.merge( - mapped_package_refs, - on="Address ID", - how="left" - ) - survey_results["Package Ref"] = np.where( - pd.notnull(survey_results["Matched Package Ref"]), - survey_results["Matched Package Ref"], - survey_results["Package Ref"] - ) - survey_results = survey_results.drop(columns=["Matched Package Ref"]) - - # Do the same with survey_results_with_original_features - survey_results_with_original_features = 
survey_results_with_original_features.merge( - mapped_package_refs, - on="Address ID", - how="left" - ) - survey_results_with_original_features["Package Ref"] = np.where( - pd.notnull(survey_results_with_original_features["Matched Package Ref"]), - survey_results_with_original_features["Matched Package Ref"], - survey_results_with_original_features["Package Ref"] - ) - survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"]) - - # Save the data for reference - # mapped_package_refs = mapped_package_refs.merge( - # asset_list[["Name", "Postcode", "Address ID", "Org. ref."]], - # on="Address ID", - # how="left" - # ) - # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False) - - # We get longitude & Latitude - archetyping_spatial_features = read_pickle_from_s3( - bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", - ) - archetyping_spatial_features = pd.concat(archetyping_spatial_features) - archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename( - columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"} - ) - # Merge them onto both datasets - asset_list = asset_list.merge( - archetyping_spatial_features, how="left", on="UPRN" - ) - if pd.isnull(asset_list["longitude"]).sum(): - raise ValueError("Something went wrong") - - survey_results_with_original_features = survey_results_with_original_features.merge( - archetyping_spatial_features, how="left", on="UPRN" - ) - if pd.isnull(survey_results_with_original_features["longitude"]).sum(): - raise ValueError("Something went wrong") - - def haversine(lat1, lon1, lat2, lon2): - # Radius of Earth in meters - R = 6371000 - - # Convert degrees to radians - lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) - - # Differences - dlat = lat2 - lat1 - dlon = lon2 - lon1 - - # Haversine formula - a = 
np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2 - c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) - distance = R * c - return distance - - # Tier definitions - # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D - # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D - # - - def match_property_to_surveyed(property, survey_results_with_original_features): - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Postal Region"] == - property["Postal Region"] - ) & - ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] - ) - & - ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0] - ) - ].copy() - - if not surveyed.empty: - return surveyed - - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Postal Region"] == - property["Postal Region"] - ) & - ( - survey_results_with_original_features["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0] - ) - & - ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0] - ) - ].copy() - - # surveyed = survey_results_with_original_features[ - # ( - # survey_results_with_original_features["Property Type"] == - # property["Property Type"] - # ) & - # ( - 
# survey_results_with_original_features["Wall Type"] == - # property["Wall Type"] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"] == - # property["Heating"] - # ) - # ].copy() - - if not surveyed.empty: - return surveyed - - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] - ) & - ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] - ) & - ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0] - ) - ].copy() - - return surveyed - - def fill_survey_columns(region_assets, suffix): - for col in [ - 'Current EPC Band', 'Current SAP Rating', - 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', - 'Survey: Main Roof Type', 'Survey: Primary Heating System', - 'Survey: Matching Address ID', 'Distance to Closest Match (m)', - "Package Ref" - ]: - region_assets[col] = np.where( - pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), - region_assets[col + suffix], region_assets[col] - ) - return region_assets - - survey_attribute_columns = [ - "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - 'Survey: Primary Heating System' - ] - - survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy() - - results = [] - for region in tqdm(unique_postal_regions): - # Take all of the properties in that region - region_assets = asset_list[asset_list["Postal Region"] == region].copy() - - # We have a tier 1 match if the property itself was surveyed - exact_surveyed = survey_results[ - survey_results["Address ID"].isin(region_assets["Address ID"]) - ] - 
- region_assets = region_assets.merge( - exact_surveyed[ - ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ - "Survey: Matching Address ID", "Package Ref" - ] - ], - on="Address ID", - how="left" - ) - region_assets['Distance to Closest Match (m)'] = None - region_assets["Distance to Closest Match (m)"] = np.where( - ~pd.isnull(region_assets["Current EPC Band"]), - 0, - region_assets["Distance to Closest Match (m)"] - ) - - # Label the tier 1 properties - region_assets["Confidence Tier"] = None - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), - "1 - property was surveyed", region_assets["Confidence Tier"] - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]), - "5 - property was surveyed", region_assets["Confidence Tier"] - ) - - archetype_ids = region_assets[ - pd.isnull(region_assets["Confidence Tier"]) - ]["Archetype ID"].unique() - # We get the properties that have been surveyed - - region_surveyed = [] - for arch_id in archetype_ids: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - - match_type = "2 - same archetype" - if any(archetype_data["Postal Region"] == property["Postal Region"]): - match_type = "1 - same archetype, same postal region" - archetype_data = archetype_data[ - archetype_data["Postal Region"] == property["Postal Region"] - ] - - if archetype_data.shape[0] > 1: - # Look for an exact match, or as close as possible - archetype_data_filtered = match_property_to_surveyed(property, archetype_data) - if not archetype_data_filtered.empty: - archetype_data = archetype_data_filtered - - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - 
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - - archetype_data = archetype_data.sort_values("distance_meters", ascending=True) - - # We take the features of the closest matching property - closest_match = archetype_data.iloc[0] - - # Set the package ref - if expected_epc in ["C", "B", "A"]: - package_ref = None - else: - package_ref = archetype_data["Package Ref"].dropna().values[0] - - region_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc, - "Current SAP Rating": expected_sap, - 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], - 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"], - 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], - 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], - "Survey: Matching Address ID": closest_match["Address ID"], - 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": package_ref, - "Match Type": match_type - } - ) - region_surveyed = pd.DataFrame(region_surveyed) - - if region_surveyed.empty: - region_surveyed = pd.DataFrame( - columns=[ - "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", - 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', - "Match Type", "Package Ref" - ] - ) - - starting_shape = region_assets.shape[0] - region_assets = region_assets.merge( - region_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method1") - ) - if region_assets.shape[0] != starting_shape: - raise ValueError("Something went wrong") - - # Label the tier 1 
properties - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & - pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]), - region_assets["Match Type"], region_assets["Confidence Tier"] - ) - - # Handle EPC C - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) & - pd.isnull(region_assets["Confidence Tier"]), - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets = fill_survey_columns(region_assets, suffix="_method1") - - method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] - region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"]) - - missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() - - if not missed_addressids: - results.append(region_assets) - continue - - # This means that this archetype was never surveyed and so we need to find a sufficiently similar property - final_missed_matches = [] - for a_id in missed_addressids: - - match_type = "3 - compared to similar properties" - - property = asset_list[asset_list["Address ID"] == a_id].squeeze() - - surveyed = match_property_to_surveyed(property, survey_results_with_original_features) - - if surveyed.empty: - match_type = "3 - compared to similar properties, relaxed" - # In this case, we do one additional check where we filter on everything the same apart from heating, - # where we do a slightly more rough match - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof 
Type"].split(":")[0] - ) - ].copy() - - if surveyed.empty: - if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]: - filter_property_types = ["House", "Bungalow", ] - else: - filter_property_types = ["Flat"] - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - ) - ) & - ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] - ) & - ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] - ) - ].copy() - - if "Electric" in property["Heating"]: - # Take other electric heating systems - surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] - elif property["Heating"] in [ - "Community Heating Systems: Community boilers only (RdSAP)", - "Community Heating Systems: Community CHP and boilers (RdSAP)" - ]: - # Take other community heating systems - surveyed = surveyed[surveyed["Heating"].str.contains("Community")] - elif property["Heating"] == 'Heat Pump: (from database)': - # Take other heat pumps - surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")] - elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": - # Take other properties with room heaters - surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] - elif "Boiler" in property["Heating"]: - # Take other properties with boilers - surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")] - else: - raise Exception("Fix me") - - if surveyed.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Needs Survey", - "Current SAP Rating": "Needs Survey", - 'Survey: Main Wall Type': "Not Surveyed", - "Survey: Main Alternative Wall": "Not Surveyed", - "Survey: Main Roof Type": "Not Surveyed", - 
"Survey: Primary Heating System": "Not Surveyed", - "Survey: Matching Address ID": "Not Surveyed", - 'Distance to Closest Match (m)': 9999999, - "Package Ref": "Not Surveyed", - } - ) - continue - - # Calculate distance - surveyed["distance_meters"] = haversine( - lat1=property["latitude"], lon1=property["longitude"], - lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values - ) - surveyed = surveyed.sort_values("distance_meters", ascending=True) - - # Check if we have a postcode match check if surveyed postcode is the same as the property postcode - if any(surveyed["Postcode"] == property["Postcode"]): - surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]] - - if any(surveyed["Postal Region"] == property["Postal Region"]): - surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] - - # Take the 3 nearest - surveyed = surveyed.head(3) - - # perform a weighted mean of SAP rating - the closer the better - expected_sap = np.average( - surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - - if expected_epc in ["C", "B", "A"]: - match_type = "5 - EPC C or above" - - closest_match = surveyed.iloc[0] - - # The closest property may be an EPC C, we we take the package ref from the property that's the nearest - # with non-NA package ref - if expected_epc in ["C", "B", "A"]: - package_ref = None - else: - package_ref = surveyed["Package Ref"].dropna().values[0] - - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": match_type, - "Current EPC Band": expected_epc, - "Current SAP Rating": expected_sap, - 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], - "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"], - "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"], - "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], - "Survey: Matching Address ID": 
closest_match["Address ID"], - 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": package_ref - } - ) - continue - - final_missed_matches = pd.DataFrame(final_missed_matches) - - region_assets = region_assets.merge( - final_missed_matches, - on="Address ID", - how="left", - suffixes=("", "_method3") - ) - - region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( - region_assets["Confidence Tier_method3"] - ) - - region_assets = fill_survey_columns(region_assets, suffix="_method3") - - method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")] - region_assets = region_assets.drop(columns=method_3_columns) - - if pd.isnull(region_assets["Current EPC Band"]).sum(): - raise Exception("Something went wrong") - - results.append(region_assets) - - results = pd.concat(results) - - if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum(): - raise ValueError("Missing Package Refs") - - # Check if there are missings in current epc band, current sap rating or any of the survey attributes - for c in ( - [ - "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + - survey_attribute_columns - ): - if pd.isnull(results[c]).sum(): - raise Exception("Something went wrong") - - gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) - loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) - - def optimise(gain, loss, max_loss=250): - - # Define the coefficients for the objective function (negative because we maximize Gain) - c = -gain - - # Define constraints - A = [loss] # Only 1 constraint for now, total Loss - b = [max_loss] # Maximum total Loss allowed - - # Bounds for each variable (select or not select each row, 0 <= x <= 1) - bounds = [(0, 1) for _ in gain] - - # Solve the problem using linprog with HiGHS 
solver - result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') - if not result.success: - raise Exception("Optimization failed") - - selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 - optimal_gain = -result.fun - - return selected_rows, optimal_gain - - street_summary = results.pivot_table( - index='Street and Region', - columns='Confidence Tier', - aggfunc='size', - fill_value=0 - ).reset_index() - - street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) - street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - - selected_rows, _ = optimise( - gain=street_summary["Gain"].values, - loss=street_summary["Loss"].values, - max_loss=250 - ) - - street_summary["Selected"] = selected_rows == 1 - print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum()) - - selected_streets = street_summary[ - street_summary["Selected"] - ] - - totals = selected_streets[["Gain", "Loss"]].sum() - - bid_size = totals.sum() - print("Bid Size:", bid_size) - total_epc_d_or_below = totals["Gain"] - print("Total EPC D or below:", total_epc_d_or_below) - total_epc_c = totals["Loss"] - print("Total EPC C or above:", total_epc_c) - # Total needing a survey - total_needing_survey = selected_streets[ - "4 - no similar property, needs survey to confirm" - ].sum() - print("Total needing survey:", total_needing_survey) - - # Label final outputs - # We create a summary of packages by street - results["Package Ref"] = results["Package Ref"].fillna("EPC C - No Package") - results["Package Ref"] = results["Package Ref"].astype(str) - results["Package Ref"] = np.where( - results["Package Ref"] == "4.0", "4", results["Package Ref"] - ) - package_summary = results.pivot_table( - index='Street and Region', - columns='Package Ref', - aggfunc='size', - fill_value=0 - ).reset_index() - - assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0] - - street_bid_structure = street_summary.merge( - 
package_summary, how="left", on="Street and Region" - ) - street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) - - individual_units_programme = results.copy() - individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( - street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values - ) - - # Merge on Stonewaters ID - asset_list_ids = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " - "- Archetyped V3.1.xlsx", - header=4 - )[["Address ID", "Org. ref."]] - # Clean address ids - asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] - asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] - asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) - - individual_units_programme = individual_units_programme.merge( - asset_list_ids.rename( - columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"} - ), - how="left", - on="Survey: Matching Address ID" - ) - - individual_units_programme["Survey: Org. ref."] = np.where( - (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"), - "Not Surveyed", - individual_units_programme["Survey: Org. ref."] - ) - - if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull( - individual_units_programme["Org. 
ref."]).sum(): - raise ValueError("something went wrong") - - for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]: - individual_units_programme[col] = ( - individual_units_programme[col] - .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':' - .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': ' - .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space - .str.strip() # Strip leading/trailing spaces - ) - - # Any EPC C properties that have been included should be flagged as potential low carbon heating - selected_epc_c = individual_units_programme[ - (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) & - (individual_units_programme["Unit in Programme"]) - ] - - flat_wall_map = { - "CA Cavity: F Filled Cavity": False, - "CA Cavity: A As Built": True, - "SO Solid Brick: A As Built": True, - "Not Surveyed": False - } - - heating_map = { - "BGW Post 98 Combi condens. with auto ign.": False, - "BGB Post 98 Regular condens. 
with auto ign.": False, - "SEK High heat retention storage heaters": False, - "SEB Modern slimline storage heaters": True, - "Not Surveyed": False - } - - infill_data = [] - for _, epc_c_property in selected_epc_c.iterrows(): - if epc_c_property["Property Type"].split(":")[0] == "Flat": - # Look for a wall insulation measure - infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]] - infill_data.append( - { - "Address ID": epc_c_property["Address ID"], - "Street and Region": epc_c_property["Street and Region"], - "Possible Flat Infill?": infill - } - ) - continue - - infill = heating_map[epc_c_property["Survey: Primary Heating System"]] - infill_data.append( - { - "Address ID": epc_c_property["Address ID"], - "Street and Region": epc_c_property["Street and Region"], - "Low Carbon Heating Infill?": infill - } - ) - infill_data = pd.DataFrame(infill_data) - - individual_units_programme = individual_units_programme.merge( - infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']], - how="left", on="Address ID" - ) - - for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']: - individual_units_programme[c] = individual_units_programme[c].fillna(False) - - infill_by_street = infill_data.pivot_table( - index='Street and Region', - values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'], - aggfunc='sum', - fill_value=0 - ).reset_index() - - street_bid_structure = street_bid_structure.merge( - infill_by_street, how="left", on="Street and Region" - ) - - for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: - street_bid_structure[c] = street_bid_structure[c].fillna(0) - - master_sheet = pd.read_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " - "master " - "sheet.csv", - encoding='latin1' - ) - master_sheet = master_sheet[["Address ID", "Main Fuel"]] - - individual_units_programme = individual_units_programme.merge( - master_sheet, how="left", 
on="Address ID" - ) - - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure V2.csv"), index=False - ) - - individual_units_programme.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False - ) - - survey_results = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), - header=13, - sheet_name="Modelled Packages" - ) - - indivual_units = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv") - ) - - u_aids = survey_results["Archetype ID"].astype(str).unique() - units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values - - len({v for v in units_in_bid if str(v) in u_aids}) - len(list(set(units_in_bid))) - - -def identify_incorrect_packages(): - """ - Due to limitations in the data collected during survey, we have some properties that do not have suitable packages - assigned. This function will identify those properties, which can be flagged for Stonewater's review - """ - - units_with_assigned_packages = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.20 V2.xlsx"), - header=2, - sheet_name="Individual Units Programme" - ) - - # This sheet contains information on the heating systems for properties, so we can flag any units that have - # been labelled as being electric but are actually gas - heating_survey_data = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "STOCKBOOK December 2024 data (5).xlsx"), - header=0, - sheet_name="Export" - ) - - units_with_assigned_packages = units_with_assigned_packages.merge( - heating_survey_data[["Asset Reference", "Heating Type"]], how="left", - left_on="Org. 
ref.", right_on="Asset Reference" - ) - - # Check the different heating types - units_with_assigned_packages["Gas properties: different to Parity"] = ( - ( - units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"]) - ) & ( - units_with_assigned_packages["Heating"].isin( - [ - "Heat Pump: Electric Heat " - "pumps: Air source heat pump " - "with flow temperature <= 35°C", - "Electric Storage Systems: Fan " - "storage heaters", - "Electric (direct acting) room " - "heaters: Panel, convector or " - "radiant heaters" - ] - ) - ) - ) - - units_with_assigned_packages["Electric properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"] == "Electric") & ( - units_with_assigned_packages["Heating"].isin( - [ - "Boiler: A rated Regular Boiler", - "Boiler: F rated Combi", - "No Heating", - "Boiler: A rated CPSU", - "Boiler: G rated Regular Boiler" - ] - ) - ) - ) - - units_with_assigned_packages["Ground Source properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"] == "Ground Source") & ( - units_with_assigned_packages["Heating"].isin( - [ - "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C", - "Electric Storage Systems: Fan storage heaters", - "Electric Storage Systems: High heat retention storage heaters" - ] - ) - ) - ) - - units_with_assigned_packages["LPG properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"] == "Lpg") & ( - units_with_assigned_packages["Main Fuel"].isin( - [ - "Gas: Mains Gas", "Solid Fuel: Wood Logs, Gas: Mains Gas" - ] - ) - ) - ) - - units_with_assigned_packages["Solid Fuel properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"] == "Solid Fuel") & ( - units_with_assigned_packages["Main Fuel"].isin( - [ - "Gas: Mains Gas" - ] - ) - ) - ) - - # The next check is to identify properties with specific features that are not condusive to specific packages. E.g. 
- # Solar PV packages for properties that have another dwelling above - # Label properties that have been matched to a package, during coordination, that includes Solar PV and has - # a property with a dwelling above - units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( - (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( - units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") - ) - ) - - # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned - # a package that includes solar PV - units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( - (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( - units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") - ) - ) - - # We now iterate through postcodes and find anomalous properties based on the partiy data and survey data - fields_to_check = [ - 'Wall Type Category', - # 'Roof Type Category', - not very interesting - 'Heating', - 'Main Fuel', - 'Survey: Main Wall Type', - # 'Survey: Main Roof Type', - 'Survey: Primary Heating System' - ] - - units_with_assigned_packages['Wall Type Category'] = units_with_assigned_packages['Wall Type'].str.replace( - r'\s*\(.*?\)', '', regex=True - ) - - # Create roof type category by splitting in colon and taking the first part - units_with_assigned_packages['Roof Type Category'] = units_with_assigned_packages['Roof Type'].str.split(':').str[0] - - units_with_assigned_packages["Street, Region and Postcode"] = ( - units_with_assigned_packages["Street and Region"] + ", " + units_with_assigned_packages["Postcode"] - ) - - def check_mixed_types(row): - # Count distinct primary types with non-zero values - primary_types_present = set() - for col in field_counts.columns: - if ':' in col: - primary_type = col.split(':')[0] - if 
row[col] > 0: # Non-zero count means this type is present - primary_types_present.add(primary_type) - return len(primary_types_present) > 1 # True if more than one primary type - - aggregated_results = {} - for field in fields_to_check: - # Group by postcode and count occurrences of each unique value - field_counts = ( - units_with_assigned_packages.groupby(['Street, Region and Postcode', field]) - .size() - .unstack(fill_value=0) - .reset_index() - ) - - # Calculate dominant value and percentage before modifying the DataFrame - dominant_value = field_counts.iloc[:, 1:].idxmax(axis=1) - dominant_percentage = ( - (field_counts.iloc[:, 1:].max(axis=1) / field_counts.iloc[:, 1:].sum(axis=1)) * 100 - ) - number_of_properties = field_counts.iloc[:, 1:].sum(axis=1) - - # Add these as new columns after computation - field_counts['Dominant Value'] = dominant_value - field_counts['% Dominant'] = dominant_percentage - field_counts['Number of Properties'] = number_of_properties - field_counts['Mixed Type'] = field_counts.apply(check_mixed_types, axis=1) - - # Store the result in the dictionary - aggregated_results[field] = field_counts - - # Let's fetch the EPC data - # Read in the existing EPC data we stored - import json - from utils.s3 import read_from_s3, read_pickle_from_s3 - def read_epc_data(): - epc_data = json.loads( - read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/epc_data.json" - ) - ) - epc_data = pd.DataFrame(epc_data) - - epc_data["uprn"] = np.where( - epc_data["internal_id"] == 1091, - 83143766, - epc_data["uprn"] - ) - epc_data_batch_2 = read_pickle_from_s3( - s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - bucket_name="retrofit-data-dev" - ) - epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) - - complete_epcs = pd.concat([epc_data, epc_data_batch_2]) - - return complete_epcs - - epc_data = read_epc_data() - # Get just the fields we want from the EPC: Uprn, Wall, Roof, Heating, Fuel, SAP 
Score, EPC Band, Date of EPC - epc_data_to_append = epc_data[ - [ - "uprn", "walls-description", "roof-description", "mainheat-description", "main-fuel", - "current-energy-efficiency", "current-energy-rating", "lodgement-date", - "estimated" - ] - ].rename( - columns={ - "uprn": "UPRN", - "walls-description": "EPC: Wall Type", - "roof-description": "EPC: Roof Type", - "mainheat-description": "EPC: Heating", - "mainfuel": "EPC: Main Fuel", - "current-energy-efficiency": "EPC: SAP Score", - "current-energy-rating": "EPC: EPC Band", - "lodgement-date": "EPC: Date of EPC", - "estimated": "EPC Estimated based on Nearby Properties" - } - ) - # Take non-estimated EPCs? - # epc_data_to_append = epc_data_to_append[epc_data_to_append["EPC Estimated based on Nearby Properties"] != True] - # Take the newest EPC per UPRN, based on lodgement date - epc_data_to_append = epc_data_to_append.sort_values("EPC: Date of EPC", ascending=False).drop_duplicates("UPRN") - - epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"]) - # Years since the EPC was lodged - epc_data_to_append["Years since EPC"] = (pd.Timestamp.now() - epc_data_to_append["EPC: Date of EPC"]).dt.days / 365 - epc_data_to_append = epc_data_to_append[epc_data_to_append["UPRN"] != ""] - epc_data_to_append["UPRN"] = epc_data_to_append["UPRN"].astype(int) - - units_with_assigned_packages = units_with_assigned_packages.merge( - epc_data_to_append, how="left", on="UPRN", - ) - - # Read in the wave 2.1 data - wave_2_data = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater 2.1 SAP Pre & Post.xlsx" - ), - header=3 - ) - # Remove any where the work is outstanding - wave_2_data = wave_2_data[wave_2_data["Retrofit Assessment"] == "Completed"] - wave_2_data = wave_2_data[~pd.isnull(wave_2_data["Package Approved (Client)"])] - wave_2_data["house_number"] = wave_2_data["Name"].apply(lambda x: SearchEpc.get_house_number(x, "")) - - # Filter postcodes in the 
units_with_assigned_packages, to find overlapping postcodes - related_to_wave_2 = units_with_assigned_packages[ - units_with_assigned_packages["Postcode"].isin( - wave_2_data["Post Code"].values - ) & ( - ~units_with_assigned_packages["Confidence Tier"].isin( - [ - "1 - same archetype, same postal region", "1 - property was surveyed" - ] - ) - ) - ] - - wave2_matches = [] - for _, home in related_to_wave_2.iterrows(): - # Get the related homes - assigned_wave_2_packages = wave_2_data[ - wave_2_data["Post Code"] == home["Postcode"] - ] - - if assigned_wave_2_packages.shape[0] != 1: - # In this case, we get the closest match based on door number - hn = SearchEpc.get_house_number(home["Name"], home["Postcode"]) - - assigned_wave_2_packages = assigned_wave_2_packages[ - abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn)) == min( - abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn))) - ] - - wave2_matches.append( - { - "UPRN": home["UPRN"], - "2.1 matched address": assigned_wave_2_packages["Name"].values[0], - "2.1 matched address: Package Ref": assigned_wave_2_packages["Package Approved (Client)"].values[0], - "2.1 matched address: Wall Insulation": assigned_wave_2_packages["Wall Insulation"].values[0], - "2.1 matched address: Loft Insulation": assigned_wave_2_packages["Loft Insulation"].values[0], - "2.1 matched address: Ventilation": assigned_wave_2_packages["Ventilation"].values[0], - "2.1 matched address: Windows": assigned_wave_2_packages["Windwos Upgrade"].values[0] - } - ) - - # Store each results to CSV - for field, df in aggregated_results.items(): - df.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, f"{field} - aggregated results.csv"), index=False - ) - - # Store units_with_assigned_packages - units_with_assigned_packages.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False - ) - - -def extract_sharepoint_url(x): - if pd.isnull(x): - return "" - return "/".join(parse.urlparse( 
- x.split(" - http")[1] - ).path.replace("%20", " ").split("/")[-2:]) - - -def revised_model(): - """ - This function implements the revised model for Stonewater, where we are looking at new priority postcodes - This work was undertaken in January 2021. - """ - - # 1) Create the new list of properties - new_priority_postcodes = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 " - "priority list.xlsx" - ) - - original_archetypes = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " - "- Archetyped V3.1.xlsx", - header=4 - ) - original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] - original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] - original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) - - wave_21_folder_name = "Wave 2.1 Surveys - 2" - - # Check if we have all of the addresses - missed = original_archetypes[ - ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) - ]["Archetype ID"].unique() - - assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} - - original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"] - ] - - # Merge these archetypes on to the new priority postcodes - new_priority_postcodes = new_priority_postcodes.merge( - original_archetypes, how="left", on="Address ID" - ) - - # Basic check, should have no rows with missing Archetype ID, where - assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin( - original_archetypes["Address ID"] - ).sum()) == 0 - - # We pull together the survey data sheet - survey_folders = [] - - # Loop over each survey folder and list its 
contents - for i in range(1, NUM_FOLDERS + 1): - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") - if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] - survey_folders.extend(folder_contents) # Append contents to the master list - - wave_21_folders = [ - "1. Herefordshire", - "2. Bedfordshire", - "3. Wiltshire", - "4. Bournemouth", - "5. Coventry", - "6. West Sussex", - "7. Dorset", - "8. Cambridgeshire", - "9. Guildford", - "10. Little Island", - "11. CCS Dorset" - ] - - for wave_2_1_folder in wave_21_folders: - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) - if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in - os.listdir(folder_path)] - survey_folders.extend(folder_contents) # Append contents to the master list - - # We now do a large pull of all of the data - extracted_data = [] - mtp_extracted_data = [] # Additional data to extract from the medium term plans - for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) - - # Check that the survey folder is actually a folder - if not os.path.isdir(survey_folder_path): - continue - - # List the folders inside of the survey folder - survey_subfolders = [ - name for name in os.listdir(survey_folder_path) - if os.path.isdir(os.path.join(survey_folder_path, name)) - ] - - # Check if there's a "retrofit assessment" folder - retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) - - ra_folder = next( - (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), - None - ) - - mtp_folder = next( - (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), - None - ) - 
if mtp_folder: - # We have a mid term plan: - mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) - # Get the contents - files and not folder - mtp_contents = [ - os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) - if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) - ] - - has_v1 = [ - f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() - ] - - if has_v1: - # Then we go one level deeper - mtp_contents = [ - os.path.join(has_v1[0], f) for f in - os.listdir(os.path.join(survey_folder_path, has_v1[0])) - ] - - # We check the the IMA - for file_name in mtp_contents: - - filepath = os.path.join(survey_folder_path, file_name) - # We expect a pdf so try and parse it - try: - with open(filepath, "rb") as file: - reader = PyPDF2.PdfReader(file) - # Just the first page - text = reader.pages[0].extract_text() - - except Exception as e: - continue - - # We check if this is an IMA - ima_heading_search = re.search( - r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text - ) - - is_ima = bool(ima_heading_search) - if not is_ima: - continue - - # Otherwise, extract: RIR, PV - pv_search = re.search(r"PV \(\d+Kwp\)", text) - has_pv = bool(pv_search) - pv_system = pv_search.group(0) if has_pv else None - - # We perform a second search for PV: - if pv_search is None: - pv_search = re.search("solar pv", text.lower()) - has_pv = bool(pv_search) - pv_system = "Solar PV" if has_pv else None - - rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) - has_rir = bool(rir_search) - rir_spec = rir_search.group(0) if has_rir else None - - mtp_extracted_data.append({ - "survey_folder": survey_folder, - "has_pv": has_pv, - "PV System": pv_system, - "RIR Specification": rir_spec, - "has_rir": has_rir - }) - continue - - # If retrofit assessment folder exists, check if it has content - if retrofit_folder or ra_folder: - if retrofit_folder: - retrofit_folder_path 
= os.path.join(survey_folder_path, retrofit_folder) - else: - retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) - - # Check if everything inside is a sub-folder and the number of folders is 2 - items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] - all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] - if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: - # Get the folder that isn't Property Pics - retrofit_folder_path = os.path.join( - retrofit_folder_path, [item for item in items if item != "Property Pics"][0] - ) - - if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) - if summary_data: - summary_data = { - "survey_folder": survey_folder, - **summary_data, - } - extracted_data.append(summary_data) - continue - else: - # Then we have an empty Retrofit Assessment folder - continue - - # If no retrofit folder or it was empty, check files in survey_folder - summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) - if not summary_data: - if len(survey_subfolders) == 1: - survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) - summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) - - if summary_data: - summary_data = { - "survey_folder": survey_folder, - **summary_data, - } - extracted_data.append(summary_data) - - retrofit_assessment_data = pd.DataFrame(extracted_data) - mtp_df = pd.DataFrame(mtp_extracted_data) - - # Save - # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False - # ) - # mtp_df.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False - # ) - retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), - ) 
- mtp_df = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), - ) - - # There are a few duplicates we just manually drop - mtp_df = mtp_df.drop_duplicates() - mtp_df = mtp_df[ - ~(( - mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" - ) & (~mtp_df["has_pv"])) - ] - - mtp_df = mtp_df[ - ~(( - mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5" - ) & (~mtp_df["has_pv"])) - ] - - # Remove some definite duplicates - dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] - dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] - dupes = dupes.sort_values("Address") - # Get all of the folders that end with ROSS - to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() - - # Replace \n with "" - retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") - - retrofit_assessment_data = retrofit_assessment_data[ - ~retrofit_assessment_data["survey_folder"].isin( - [ - "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", - "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", - "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" - ] + to_drop - ) - ] - - retrofit_assessments_data_columns = [ - 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', - 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys', - 'Fuel Bill', 'Window Age Description', - 'Window Age Description Proportion (%)', - 'Secondary Window Age Description', - 'Secondary Window Age Description Proportion (%)', 'Number of Windows', - 'Total Number of Doors', 'Number of Insulated Doors', - 'Existing Primary Heating System', - 'Existing Primary Heating PCDF Reference', - 'Existing Primary Heating Controls', - 'Existing Primary Heating % of Heat', - 'Existing Secondary Heating System', - 'Existing Secondary Heating 
PCDF Reference', - 'Existing Secondary Heating Controls', - 'Existing Secondary Heating % of Heat', 'Secondary Heating Code', - 'Water Heating Code', 'Total Floor Area (m2)', - 'Total Ground Floor Area (m2)', 'RIR Floor Area', - 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)', - 'Number of Light Fittings', 'Number of LEL Fittings', - 'Number of fittings needing LEL', 'Main Roof Type', - 'Main Roof Insulation', 'Main Roof Insulation Thickness', - 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', - 'Main Wall Thickness', 'Main Building Alternative Wall Type', - 'Main Building Alternative Wall Insulation', - 'Main Building Alternative Wall Dry-lining', - 'Main Building Alternative Wall Thickness', - 'Main Fuel', - 'Main Building Age Band', - ] - # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: - retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] - rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed)) - retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict) - retrofit_assessment_data["Survey: Current EPC Band"] = ( - retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x)) - ) - - # We can read in the data as needed - - # Next Step: Read in the coordinated measures and match to the extracted data - ############################################################ - # CCS - ############################################################# - ccs_coordination_sheet = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, - "Jan 2025 Project", - "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx" - ), - header=4 - ) - ccs_postcodes = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx" - ), - header=4 - ) - ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 
'Asset ID.1']].merge( - ccs_coordination_sheet, how="left", on="Name" - ) - ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])] - ccs_coordination_sheet["contractor"] = "CCS" - # We split ccs into two sections - the first being - ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) - ccs_coordination_sheet = ccs_coordination_sheet.head(87) - ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) - - ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) - - ############################################################ - # WATES - ############################################################# - wates_coordination_sheet = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx" - ), - header=4 - ) - wates_postcodes = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" - ), - header=4 - ) - wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])] - wates_coordination_sheet = wates_coordination_sheet.merge( - wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name" - ) - - wates_coordination_sheet["contractor"] = "Wates" - # Break into the different sites: - # Wiltshire - wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267) - wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :] - wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :] - wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :] - wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] - wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] - wates_coordination_sheet_removed_from_programme = 
wates_coordination_sheet.iloc[586:926, :] - wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :] - - wates_coordination = pd.concat( - [ - wates_coordination_sheet_wiltshere, - wates_coordination_sheet_herefordshire, - wates_coordination_sheet_coventry, - wates_coordination_sheet_bedfordshire, - wates_coordination_sheet_bournemouth, - wates_coordination_sheet_cambridgeshire, - wates_coordination_sheet_removed_from_programme, - wates_coordination_sheet_abeyance - ] - ) - # We correct the Asset ID for 34 Kempster Close - wates_coordination["Asset ID"] = np.where( - wates_coordination["Name"] == "34 Kempster Close", - "12005", - wates_coordination["Asset ID"] - ) - - # We fill the missing ids - missing_lookup = { - "4 Sydnall Fields": 31231, - "12 Sydnall Fields": 31239, - "12 Athena Gardens": 28061, - "49 Banner Lane": 41189, - "4 Jonathan Road": 41232, - "8 Jonathan Road": 41236, - "1 Jonathan Road": 41229, - "96 Taunton Way": 31417, - "94 Taunton Way": 31418, - "1 Lady Lane": 29430, - "10 Jonathan Road": 41283, - "21 Jonathan Road": 41246, - "12 Ashcroft Close": 26399 - } - for name, asset_id in missing_lookup.items(): - wates_coordination["Asset ID"] = np.where( - wates_coordination["Name"] == name, - asset_id, - wates_coordination["Asset ID"] - ) - - wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] - - wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( - lambda x: extract_sharepoint_url(x) - ) - - ############################################################ - # NEW 450 COORDINATED RETROFIT ASSESSMENTS - ############################################################# - features = pd.read_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " - "master sheet.csv", - encoding='latin1' - ) - features["Address ID"] = features["Address ID"].astype(str).astype(int) - features_to_merge = features[["Address ID", 
"Organisation Reference"]] - - retrofit_packages_board = pd.read_excel( - os.path.join( - CUSTOMER_FOLDER_PATH, - "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" - ), - header=4 - ) - retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] - # Take just the rows that have been surveyed - retrofit_packages_board = retrofit_packages_board[ - retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) - ] - - retrofit_packages_board = retrofit_packages_board.merge( - features_to_merge, how="left", on="Address ID" - ) - - manual_filters = { - "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", - "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", - "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", - 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", - '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', - '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', - 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', - 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', - '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', - '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', - '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', - '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', - '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', - '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', - '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', - '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', - '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', - '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", - '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', - "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', - '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', - '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', - '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', - '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' - } - - # We now match this retrofit packages board to the extracted data - matching_lookup = [] - for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - - # Handle the case that has the wrong postcode in the asset data - if home["Name"] in manual_filters: - filtered = retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] - ].copy() - else: - filtered = retrofit_assessment_data[ - retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() - ].copy() - - # We check that home["Name"] is 
contained in the survey_folder, after removing punctuation and spaces - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False - ) - if to_filter.sum() == 0: - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", - "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False - ) - filtered = filtered[to_filter] - - if filtered.empty: - continue - - if filtered.shape[0] == 1: - matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Address ID": home["Address ID"], - "Name": home["Name"] - } - ) - continue - - # home["Name"] should be contained in the survey_folder - filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] - # We have an edge case wher some properties have two outputs in Sharepoint - if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": - raise Exception("Fix me1") - # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] - - if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - raise Exception("Fix me2") - # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] - - if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': - filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] - - if filtered.empty: - continue - if filtered.shape[0] != 1: - raise Exception("something went wrong") - - matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Address ID": home["Address ID"], - "Name": home["Name"] - } - ) - matching_lookup = pd.DataFrame(matching_lookup) - - ccs_coordination = ccs_coordination.rename( - columns={"Post Code": "Postcode"} - ) - ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] - 
ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] - - ccs_manual_filters = { - "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" - } - ccs_matching_lookup = [] - for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): - - # Handle the case that has the wrong postcode in the asset data - if home["Name"] in ccs_manual_filters: - filtered = retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]] - ].copy() - else: - filtered = retrofit_assessment_data[ - retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() - ].copy() - - # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False - ) - if to_filter.sum() == 0: - to_filter = ( - filtered["survey_folder"]. - str.replace(r"[^\w\s]", ""). - str.replace(",", ""). - str.replace(".", ""). 
- str.contains( - home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False - ) - ) - if to_filter.sum() == 0: - to_filter = ( - filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == - home["Name"].lower() - ) - if to_filter.sum() == 0: - to_filter = ( - filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == - home["Name"].lower() - ) - if to_filter.sum() == 0: - # Do a fuzzy match on the name - # Find the best filter - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( - lambda x: fuzz.partial_ratio(home["Name"], x) > 93 - ) - if to_filter.sum() == 0: - # We also some cases where the name of the survey folder is like "Colville Road 7" and the - # property name is actually 7 Colville Road, so we try taking the final part of the address, - # splitting on space, and adding it to the front - def reformat_survey_folder(x): - filename = x.split("/")[-1] - parts = filename.split(" ") - return " ".join(parts[-1:] + parts[:-1]) - - to_filter = ( - filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == - home["Name"].lower() - ) - - if to_filter.sum() == 0: - raise Exception("Error") - filtered = filtered[to_filter] - - if filtered.empty: - continue - - if filtered.shape[0] == 1: - ccs_matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Asset ID.1": home["Asset ID.1"], - "Name": home["Name"] - } - ) - continue - - raise Exception("No match") - - ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup) - # We get a match for all records - assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] - assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() - assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum() - - # We do the same for Wates - wates_coordination = wates_coordination.rename( - columns={"Post Code": "Postcode"} - ) - wates_coordination = 
wates_coordination[ - wates_coordination["Retrofit Assessment"].isin(["Completed"]) - ] - wates_coordination = wates_coordination[ - ~pd.isnull(wates_coordination["Postcode"]) - ] - - wates_manual_filters = { - "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", - "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", - "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", - 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', - "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", - '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', - '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2', - } - wates_matching_lookup = [] - # Examples to skip when we cannot get the data - wates_to_skip = [ - "66 Abbatt Close", # File type is unusual, couldn't extract the data - "Flat 69 Goddard Road", # Doesn't exist - "19 Garth House", # # File type is unusual, couldn't extract the data - '5 Gilpin Close', # No properly formatted EPR - '49 The Hide, Netherfield', # TODO: TEMP HERE - '19 Chanders Rd', - '5 Chanders Rd', - '23 Chanders Rd', - '3 Chanders Rd', - '1 Orchard Close', - ] - wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)] - - for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): - - # Search the folder - filtered = retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False) - ] - if len(filtered) == 1: - wates_matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Asset ID": home["Asset ID"], - "Name": home["Name"] - } - ) - continue - - if home["Name"] in wates_to_skip: - continue - - # Handle the case that has the wrong postcode in the asset data - if home["Name"] in wates_manual_filters: - filtered = 
retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]] - ].copy() - else: - filtered = retrofit_assessment_data[ - retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() - ].copy() - - # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces - to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False - ) - - if to_filter.sum() > 1: - to_filter = ( - filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() == - home["Name"].replace(r"[^\w\s]", "").lstrip().lower() - ) - - if to_filter.sum() == 0: - to_filter = ( - filtered["survey_folder"]. - str.replace(r"[^\w\s]", ""). - str.replace(",", ""). - str.replace(".", ""). - str.contains( - home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False - ) - ) - if to_filter.sum() == 0: - to_filter = ( - filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == - home["Name"].lower() - ) - if to_filter.sum() == 0: - to_filter = ( - filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == - home["Name"].lower() - ) - if to_filter.sum() == 0: - # Do a fuzzy match on the name - # Find the best filter - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( - lambda x: fuzz.partial_ratio(home["Name"], x) > 93 - ) - if to_filter.sum() == 0: - # We also some cases where the name of the survey folder is like "Colville Road 7" and the - # property name is actually 7 Colville Road, so we try taking the final part of the address, - # splitting on space, and adding it to the front - def reformat_survey_folder(x): - filename = x.split("/")[-1] - parts = filename.split(" ") - return " ".join(parts[-1:] + parts[:-1]) - - to_filter = ( - 
filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == - home["Name"].lower() - ) - - if to_filter.sum() == 0: - raise Exception("Error") - filtered = filtered[to_filter] - - if filtered.empty: - continue - - if filtered.shape[0] == 1: - wates_matching_lookup.append( - { - "survey_folder": filtered["survey_folder"].values[0], - "Asset ID": home["Asset ID"], - "Name": home["Name"] - } - ) - continue - - raise Exception("No match") - wates_matching_lookup = pd.DataFrame(wates_matching_lookup) - - # We get a match for all records - assert wates_matching_lookup.shape[0] == wates_coordination.shape[0] - assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum() - assert not wates_matching_lookup["Asset ID"].duplicated().sum() - - # Merge lookup tables onto the coordination sheets - wates_coordination = wates_coordination.merge( - wates_matching_lookup, how="left", on="Name" - ) - missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] - if not missed_asset_id.empty: - raise Exception("Missing Asset ID") - - if wates_coordination["Asset ID_x"].duplicated().sum(): - raise Exception("Duplicated IDs in wates") - - # We merge the mpt data on to the wates coordination - wates_coordination = wates_coordination.merge( - mtp_df, how="left", on="survey_folder" - ) - - ccs_coordination = ccs_coordination.merge( - ccs_matching_lookup, how="left", on="Name" - ) - ccs_coordination = ccs_coordination.merge( - mtp_df, how="left", on="survey_folder" - ) - - retrofit_packages_board = retrofit_packages_board.merge( - matching_lookup, how="left", on="Name" - ) - - # We now map the retrofit assessment data to the coordinated packages - wates_coordination = wates_coordination.merge( - retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" - ) - ccs_coordination = ccs_coordination.merge( - retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" - ) - retrofit_packages_board = 
retrofit_packages_board.merge( - retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" - ) - - # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board - to_remove = wates_coordination[ - wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) - ] - assert to_remove.shape[0] == 4 - # Remove them from the wates board - wates_coordination = wates_coordination[ - ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) - ] - - # We combine this into a singular board - coordinated_packages = pd.concat( - [ - retrofit_packages_board[ - [ - "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating', - 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref', - 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', - 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', - 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', - 'Solar PV', 'Other measures', 'Organisation Reference', - ] + retrofit_assessments_data_columns_prefixed - ], - ccs_coordination[ - [ - # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, - # Solar PV - "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', - 'SAP Band Install Package', 'Package Approved (Client)', - 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', 'PV System', - "Asset ID.1_y", - ] + retrofit_assessments_data_columns_prefixed - ].rename( - columns={ - "SAP Band Pre": "Actual SAP Band", - "SAP Rating Pre": "Actual SAP Rating", - 'SAP Rating Install Package': 'Modelled SAP Band', - 'SAP Band Install Package': 'Modelled SAP Rating', - 'Package Approved (Client)': 'Package Ref', - 'Wall Insulation': 'Main Wall Insulation', - 'Loft Insulation': 'Loft insulation', - 'Windows Upgrade': 'Window Upgrade', - 'Ext. Doors Upgrade': 'Door Upgrade', - 'Heating': 'Main Heating', - 'Other Measures': 'Other measures', - 'Asset ID.1_y': 'Organisation Reference', - "PV System": "Solar PV", - } - ), - wates_coordination[ - [ - "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', - 'SAP Band Install Package', 'Package Approved (Client)', - 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" - ] + retrofit_assessments_data_columns_prefixed - ].rename( - columns={ - "SAP Band Pre": "Actual SAP Band", - "SAP Rating Pre": "Actual SAP Rating", - 'SAP Rating Install Package': 'Modelled SAP Band', - 'SAP Band Install Package': 'Modelled SAP Rating', - 'Package Approved (Client)': 'Package Ref', - 'Wall Insulation': 'Main Wall Insulation', - 'Loft Insulation': 'Loft insulation', - 'Windows Upgrade': 'Window Upgrade', - 'Ext. 
Doors Upgrade': 'Door Upgrade', - 'Heating': 'Main Heating', - 'Other Measures': 'Other measures', - 'Asset ID_x': 'Organisation Reference', - "PV System": "Solar PV", - } - ) - ] - ) - - coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) - assert not coordinated_packages["Organisation Reference"].duplicated().sum() - - # Merge the property features on - coordinated_packages = coordinated_packages.merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], - how="left", - on="Organisation Reference" - ) - - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])] - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])] - - # We need the features pertaining to these priority postcodes - - def find_nearest_matching_property(coordinated_packages, home): - filter_levels = [ - (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), - (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), - (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4), - (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5), - (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6), - (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7), - ] - - max_confidence = max([confidence for (_, confidence) in filter_levels]) - - for i, (filters, match_confidence) in enumerate(filter_levels): - match = coordinated_packages.copy() - - for col in filters: - match = match[match[col] == home[col]] - - if not match.empty: - return match, match_confidence - - # Finally, we search for a property in the same Archetype - match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] - if not match.empty: - return 
match, max_confidence + 1 - - return None, None # No match found - - coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() - new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() - - coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip() - new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip() - - coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] - new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] - - coordinated_packages = coordinated_packages.merge( - new_priority_postcodes[["Organisation Reference", "Archetype ID"]], - how="left", - on="Organisation Reference" - ) - - # For every property in the priority postcodes data, we look for a most appropriate matching property - no_match = [] - matches = [] - for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): - # We check if the property was surveyed - survey_result = coordinated_packages[ - coordinated_packages["Organisation Reference"] == home["Organisation Reference"] - ] - if not survey_result.empty: - to_extend = [ - { - "Organisation Reference": home["Organisation Reference"], - "Best Match Organisation Reference": m, - "match_confidence": 1, - "Was Surveyed": True - } for m in survey_result["Organisation Reference"].values - ] - matches.extend(to_extend) - continue - - closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) - if closest_match is None: - no_match.append(home["Organisation Reference"]) - continue - - to_extend = [ - { - "Organisation Reference": home["Organisation Reference"], - "Best Match Organisation Reference": m, - "match_confidence": match_confidence, - "Was Surveyed": False - } for m in 
closest_match["Organisation Reference"].values - ] - matches.extend(to_extend) - - no_match_summary = new_priority_postcodes[ - new_priority_postcodes["Organisation Reference"].isin( - no_match - ) - ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[ - "Organisation Reference"].count().reset_index() - - no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) - - # len(no_match) - # 8764, 5607, 5646, 5071 - # no_match_summary.shape - # (3953, 6), (2948, 6), (2969, 7), (2575, 7) - - matches_df = pd.DataFrame(matches) - - matches_df = matches_df.merge( - coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], - left_on="Best Match Organisation Reference", right_on="Organisation Reference", - suffixes=("", " - Closest Match") - ) - - measures_columns = [ - 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', - 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', - 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', - 'Solar PV', 'Other measures' - ] - - # We want to aggregate the matches, when we have multiple - aggregated_matches_df = [] - for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): - - measures = coordinated_packages[ - ( - coordinated_packages["Organisation Reference"].isin( - mapped_matches['Best Match Organisation Reference'].values - ) - ) - ][measures_columns] - - if mapped_matches.shape[0] == 1: - # Get the measures for this property - measures = measures.squeeze() - - aggregated_matches_df.append( - { - "Organisation Reference": org_ref, - "Number of matches": 1, - "Proportion": 100, - "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], - "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], - "Was Surveyed": mapped_matches["Was Surveyed"].values[0], - **measures - } - ) - continue - - # We need to aggregate the 
matches, since we have multiple - average_rating = mapped_matches["Survey: Current SAP Rating"].mean() - number_of_matches = mapped_matches.shape[0] - average_epc_rating = sap_to_epc(average_rating) - # proportion is the number of properties that have this EPC rating - proportion_with_this_epc = int( - mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ - 0] / number_of_matches * 100 - ) - - measures_aggregated = {} - for m in measures_columns: - if any(~pd.isnull(measures[m])): - # Check if we have 2 unique values - vals = measures[~pd.isnull(measures[m])][m].unique() - if len(vals) > 1: - measures_aggregated[m] = ", ".join(vals) - else: - measures_aggregated[m] = vals[0] - - aggregated_matches_df.append( - { - "Organisation Reference": org_ref, - "Number of matches": number_of_matches, - "Proportion": proportion_with_this_epc, - "Estimated SAP Rating": average_rating, - "Estimated EPC Rating": average_epc_rating, - "Was Surveyed": False, - **measures_aggregated - } - ) - - aggregated_matches_df = pd.DataFrame(aggregated_matches_df) - - mapped_priority_list = new_priority_postcodes.merge( - aggregated_matches_df, on="Organisation Reference", how="left" - ) - - mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0] - - # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0 - - def remove_leading_zero(address): - return re.sub(r"^0([1-9]) ", r"\1 ", address) - - mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) - mapped_priority_list["address1"] = np.where( - mapped_priority_list["Organisation Reference"] == 37004, - "8 Mason Road", - mapped_priority_list["address1"] - ) - mapped_priority_list["address1"] = np.where( - mapped_priority_list["Organisation Reference"] == 37003, - "9 Mason Road", - mapped_priority_list["address1"] - ) - - mapped_priority_list = mapped_priority_list.rename( - columns={"UPRN": "uprn"} - 
) - mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] - - # Flag where 2 out of the three columns have consensus - mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( - (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | - (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | - (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) - ) - - # Let's get the newest EPC data for these properties - # We merge on UPRN, when we have it - # from etl.route_march_data_pull.app import get_data - # epc_data, errors, nodata = get_data( - # asset_list=mapped_priority_list, - # fulladdress_column="Address", - # address1_column="address1", - # postcode_column="Postcode", - # manual_uprn_map={}, - # epc_api_only=True - # ) - # - # epc_df = pd.DataFrame(epc_data) - # epc_df.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False - # ) - epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv")) - epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"}) - - # We now package up the data - - # Sheet 1 is the base coordination data - output_coordination_sheet = coordinated_packages[ - [ - "Name", "Postcode", 'Organisation Reference', 'Package Ref', - 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', - 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', - 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', - 'Solar PV', 'Other measures', - 'Survey: Current SAP Rating', - 'Survey: Current EPC Band', - 'Survey: Primary Energy Use (kWh/yr)', - 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', - 'Survey: Number of Storeys', 'Survey: Fuel Bill', - 'Survey: Window Age Description', - 'Survey: Window Age Description Proportion (%)', - 'Survey: Secondary Window Age Description', - 'Survey: Secondary Window Age Description 
Proportion (%)', - 'Survey: Number of Windows', 'Survey: Total Number of Doors', - 'Survey: Number of Insulated Doors', - 'Survey: Existing Primary Heating System', - 'Survey: Existing Primary Heating PCDF Reference', - 'Survey: Existing Primary Heating Controls', - 'Survey: Existing Primary Heating % of Heat', - 'Survey: Existing Secondary Heating System', - 'Survey: Existing Secondary Heating PCDF Reference', - 'Survey: Existing Secondary Heating Controls', - 'Survey: Existing Secondary Heating % of Heat', - 'Survey: Secondary Heating Code', 'Survey: Water Heating Code', - 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)', - 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)', - 'Survey: First Extension Wall Area (m2)', - 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings', - 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type', - 'Survey: Main Roof Insulation', - 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type', - 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining', - 'Survey: Main Wall Thickness', - 'Survey: Main Building Alternative Wall Type', - 'Survey: Main Building Alternative Wall Insulation', - 'Survey: Main Building Alternative Wall Dry-lining', - 'Survey: Main Building Alternative Wall Thickness', - 'Survey: Main Fuel', - 'Survey: Main Building Age Band', - 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' - ] - ].rename( - columns={ - 'Walls': "Parity - Walls", - 'Roofs': "Parity - Roof", - 'Heating': "Parity - Heating", - 'Main Fuel': "Parity - Fuel", - 'Age': "Parity - Age Band", - 'Property Type': "Parity - Property Type" - } - ) - - # Sheet 2 is the lookup table which maps the properties to their closest match - # We need to bring in the parity attributes between the mapped properties so we can see side-by-side - mapped_lookup = matches_df[ - [ - 'Organisation Reference', - 'Best Match Organisation Reference', - 'Survey: Current EPC Band', 
- 'Survey: Current SAP Rating', - "Was Surveyed", - "match_confidence", - ] - ].rename( - columns={ - 'Best Match Organisation Reference': "Best Match - Organisation Reference", - "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band", - 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" - } - ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", - "Total Floor Area"]], - how="left", - on="Organisation Reference" - ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", - "Total Floor Area"]].rename( - columns={ - "Organisation Reference": "Best Match - Organisation Reference", - "Walls": "Best Match - Walls", - "Roofs": "Best Match - Roof", - "Heating": "Best Match - Heating", - "Main Fuel": "Best Match - Main Fuel", - "Age": "Best Match - Age", - "Property Type": "Best Match - Property Type", - "Total Floor Area": "Best Match - Total Floor Area" - } - ), - how="left", - on="Best Match - Organisation Reference" - ).merge( - coordinated_packages[ - [ - "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', - 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', - 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', - 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)', - 'Survey: Main Building Age Band', - ] - ].rename( - columns={ - "Organisation Reference": "Best Match - Organisation Reference", - 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type', - 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation', - 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type', - 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation', - 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness', - 'Survey: Existing 
Primary Heating System': 'Best Match - Survey: Existing Primary Heating System', - } - ), - how="left", - on="Best Match - Organisation Reference" - ) - - # Finally, we have the property, against the mapped home with the estimate SAP scores and the EPC data - worksheet = mapped_priority_list[ - [ - 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', - 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', - 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', - 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed", - 'Main Wall Insulation', - 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', - 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', - 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', - 'Other measures', "2 of 3 Data Sources Have Consensus on EPC" - ] - ].rename( - columns={ - "SAP": "Parity - SAP Rating", - "SAP Band": "Parity - EPC Rating", - "Property Type": "Parity - Property Type", - "Walls": "Parity - Walls", - "Roofs": "Parity - Roofs", - 'Glazing': "Parity - Glazing", - 'Heating': 'Parity - Heating', - 'Main Fuel': 'Parity - Main Fuel', - 'Hot Water': 'Parity - Hot Water', - 'Proportion': 'Proportion of matched properties with same EPC rating', - } - ).merge( - epc_df[ - [ - "Organisation Reference", - "uprn", - "current-energy-efficiency", - "current-energy-rating", - "lodgement-date", - "construction-age-band", - "walls-description", - "roof-description", - "mainheat-description", - "windows-description", - "hotwater-description", - "main-fuel", - "total-floor-area", - ] - ].rename( - columns={ - "uprn": "Last EPC - uprn", - "current-energy-efficiency": "Last EPC - SAP Score", - "current-energy-rating": "Last EPC - EPC Rating", - "lodgement-date": "Last EPC - Date Lodged", - "construction-age-band": "Last EPC - Age Band", - "walls-description": "Last EPC - Walls", - "roof-description": "Last EPC - Roof", - "mainheat-description": 
"Last EPC - Heating", - "windows-description": "Last EPC - Windows", - "hotwater-description": "Last EPC - Hot Water", - "main-fuel": "Last EPC - Main Fuel", - "total-floor-area": "Last EPC - Total Floor Area" - } - ), - how="left", - on='Organisation Reference' - ) - - worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime( - worksheet["Last EPC - Date Lodged"]).dt.year - - worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str) - - worksheet["uprn"] = np.where( - pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]), - worksheet["Last EPC - uprn"], - worksheet["uprn"] - ) - - worksheet["uprn"] = worksheet["uprn"].replace("", "") - - worksheet = worksheet.drop(columns=["Last EPC - uprn"]) - - # Save to Excel with multiple sheets - excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx") - with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: - worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) - mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) - output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True) - -# if __name__ == "__main__": -# main() diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 216a14de..766de840 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -681,7 +681,9 @@ class RetrieveFindMyEpc: ], "High heat retention storage heaters and dual rate meter": [ "high_heat_retention_storage_heater" - ] + ], + "Increase loft insulation to 250mm": ["loft_insulation"], + "Solar photovoltaics panels, 25% of roof area": ["solar_pv"], } survey = True