import hashlib
import os
from bisect import bisect_right

from tqdm import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import msgpack

from utils.s3 import read_from_s3
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient

load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Default locations used when app() is called without arguments.
DEFAULT_INPUT_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Khalim Review - 240902 - KSQ - AIHA - SHDF Wave "
    "3 bid - Supplementary information.xlsx"
)
DEFAULT_OUTPUT_DIR = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA"
DATA_BUCKET = "retrofit-data-dev"

ADDRESS_COLS = ["Address letter or number", "Street address"]

# Rows whose "Address letter or number" is one of these are treated as
# lettered flats ("80" + "A" -> "80A ...", with "Flat A 80 ..." as a fallback).
FLAT_LETTERS = ("A", "B", "C")

# Specific amendments: asset-list address -> address used for the EPC search.
ADDRESS_AMENDMENTS = {
    "12 11-18 Schonfeld Square": "12 Schonfeld Square",
    "35 35-37 Schonfeld Square": "35 Schonfeld Square",
    '77 Schonfeld Square': '77 Lordship Road',
    "83 Lordship Road (Schonfeld Square)": "83 Lordship Road",
    "A 80 Bethune Road": "80A Bethune Road",
    "86B Bethune Road": "Flat B, 86 Bethune Road",
    "22 Glendale Road": "22 Glendale Avenue",
    "121 Southbourne Road": "121 Southbourne Grove",
}

# Addresses that are skipped entirely during the EPC search.
NO_EPC_ADDRESSES = [
    "80B Bethune Road",
    "89B Manor Road",
    "12 Monkville Avenue",
    "9 Greenview",
]

PROPERTY_TYPE_MAP = {
    "House, mid-terrace": "House",
    "House, end terrace": "House",
    "House, semi-detached": "House",
    "House, detached": "House",
    "Flat": "Flat",
}

# Asset-list column -> list of EPC comparisons. A config with "map" is an
# exact comparison after remapping the asset value; a config with
# "search_terms" checks that, where the (lower-cased) asset value contains the
# key, the EPC description contains the value.
COMPARISON_COLS = {
    "Property type": [
        {
            "epc_col": "property-type",
            "map": PROPERTY_TYPE_MAP
        },
        {
            "epc_col": "built-form",
            "map": {
                "House, mid-terrace": "Mid-Terrace",
                "House, end terrace": "End-Terrace",
                "House, semi-detached": "Semi-Detached",
                "House, detached": "Detached",
                "Flat": "Flat",
            }
        }
    ],
    "Energy starting band (EPC)": [
        {
            "epc_col": "current-energy-rating",
            "map": {}
        }
    ],
    "Wall type": [
        {
            "epc_col": "walls-description",
            "search_terms": {
                "solid": "Solid brick",
                "cavity": "Cavity wall",
                "solid - internal lining": "Solid brick",
            }
        }
    ],
    "Roof type": [
        {
            "epc_col": "roof-description",
            "search_terms": {
                "pitched": "Pitched",
                "n/a - (flat above)": "another dwelling above"
            }
        }
    ],
    "Floor type": [
        {
            "epc_col": "floor-description",
            "search_terms": {
                "solid": "Solid",
                "suspended": "Suspended",
                "solid - floating floor for services": "Solid"
            }
        }
    ],
}

# (EPC description column, columns taken from the cleaned lookup, renames),
# merged onto the EPC data in this order.
CLEANED_DESCRIPTION_MERGES = [
    (
        "walls-description",
        [
            'original_description', 'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick',
            'is_system_built', 'is_timber_frame', 'is_as_built', 'is_assumed', 'insulation_thickness'
        ],
        {
            "is_solid_brick": "is_solid_brick_wall",
            "is_system_built": "is_system_built_wall",
            "is_timber_frame": "is_timber_frame_wall",
            "is_assumed": "is_assumed_wall",
            "insulation_thickness": "insulation_thickness_wall"
        },
    ),
    (
        "roof-description",
        [
            'original_description', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched',
            'is_at_rafters', 'is_assumed', 'has_dwelling_above', 'insulation_thickness'
        ],
        {
            "is_assumed": "is_assumed_roof",
        },
    ),
    (
        "floor-description",
        [
            'original_description', 'is_solid', 'is_suspended', 'is_assumed', 'insulation_thickness'
        ],
        {
            "is_assumed": "is_assumed_floor",
            "insulation_thickness": "insulation_thickness_floor"
        },
    ),
]

# We create groups analogous to the Energy Company Obligation
# 0 - 72, 73 - 97, 98 - 199, 200+
FLOOR_AREA_BINS = [0, 72, 97, 199, 1000]
FLOOR_AREA_LABELS = ["0-72", "73-97", "98-199", "200+"]

# First year of bands B..L; anything before the first bound is band A.
AGE_BAND_STARTS = [1900, 1930, 1950, 1967, 1976, 1983, 1991, 1996, 2003, 2007, 2012]
AGE_BAND_LABELS = "ABCDEFGHIJKL"

# Proposed remaps based on discoveries, keyed by UPRN.
VALUE_REMAPS = {
    # 8 Filey Avenue
    "100021040744": {"variable": "Property type", "newvalue": "House, mid-terrace"},
    # 7 Yetev Lev Court
    "100021032043": {"variable": "Wall type", "newvalue": "Cavity"},
    # 14 Yetev Lev Court
    "100021032050": {"variable": "Wall type", "newvalue": "Cavity"},
    # 23 Yetev Lev Court
    "100021032059": {"variable": "Wall type", "newvalue": "Cavity"},
    # 30 Yetev Lev Court
    "100021032066": {"variable": "Wall type", "newvalue": "Cavity"},
    # 34 Yetev Lev Court
    "100021032070": {"variable": "Wall type", "newvalue": "Cavity"},
    # B 86 Bethune Road
    "100021026285": {"variable": "Wall type", "newvalue": "Solid"},
    # A 80 Bethune Road
    "100021026277": {"variable": "Wall type", "newvalue": "Solid"},
    # 140 Kyverdale Road
    "100021052262": {"variable": "Property type", "newvalue": "House, mid-terrace"},
    # 6 Leabourne Road
    "100021053799": {"variable": "Wall type", "newvalue": "Solid"},
    # 22 Britannia Gardens - needs confirmation
    # 7 Satanita Road - needs confirmation
    # 12 Cheltenham Crescent
    "100011402969": {"variable": "Wall type", "newvalue": "Cavity"},
    "100021031752": {"variable": "Roof type", "newvalue": "Room Roof"},
    # 79 Craven Park Road
    "100021169682": {"variable": "Roof type", "newvalue": "Room Roof"},
    # 88 Darenth Road
    "100021036148": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021036165": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021036167": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021053849": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021054353": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021054560": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021059839": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021059848": {"variable": "Roof type", "newvalue": "Room Roof"},
}

# Columns whose distinct value combinations define an archetype.
# NOTE(review): the commented-out entries were reconstructed from a source with
# collapsed line breaks, assuming each `#` disables exactly one entry - confirm.
PRIMARY_ARCHETYPING_COLS = [
    'Property type',
    "Location (Floor)",
    'Current heating system type',
    'Wall type',
    'Roof type',
    # "Location",
    # 'current-energy-rating',
    'property-type-reduced',
    'built-form-reduced',
    'is_cavity_wall',
    # 'is_solid_brick_wall',
    'is_system_built_wall',
    'is_timber_frame_wall',
    'is_as_built',
    # 'is_solid',
    'is_roof_room',
    # 'is_loft',
    'is_flat',
    'is_thatched',
    # 'is_at_rafters',
    'has_dwelling_above',
    # 'conservation_status',
]

# Not used by the archetyping below; kept as the list of candidate columns for
# a finer-grained split. (The original was missing commas, which silently
# concatenated three names into one string.)
SECONDARY_COLS = [
    'SAP_age_band',
    'is_filled_cavity',
    'insulation_thickness_wall',
    'insulation_thickness_floor',
    'insulation_thickness',
    'is_assumed_wall',
    'is_assumed_roof',
    'Floor_area_category',
]


def _build_address1(home):
    """Build the EPC search address for one asset-list row.

    Args:
        home: a row of the asset list with "Address letter or number" and
            "Street address" fields.

    Returns:
        Tuple ``(address1, backup)``. ``backup`` is the alternative
        "Flat <letter> <number> <street>" spelling for lettered flats and
        ``None`` for every other row.
    """
    letter = home["Address letter or number"]
    street_address = home["Street address"]
    backup = None
    if letter in FLAT_LETTERS:
        house_no, _, street = street_address.partition(" ")
        address1 = f"{house_no}{letter} {street}"
        backup = f"Flat {letter} {house_no} {street}"
    else:
        address1 = ""
        if not pd.isnull(letter):
            address1 += f"{letter} "
        if not pd.isnull(street_address):
            address1 += f"{street_address}"
    address1 = address1.strip()
    if address1.split(" ")[-1].lower() == "rd":
        # Expand the abbreviation to "road".
        # NOTE(review): this also lower-cases the whole address, so it can no
        # longer match a (mixed-case) amendment key - presumably the EPC search
        # is case-insensitive; confirm.
        address1 = address1.lower().replace(" rd", " road")
    return ADDRESS_AMENDMENTS.get(address1, address1), backup


def _search_epc(address1, postcode, property_type):
    """Run one EPC search (OS lookup skipped) and return the searcher."""
    searcher = SearchEpc(
        address1=address1,
        postcode=postcode,
        auth_token=EPC_AUTH_TOKEN,
        os_api_key="",
        property_type=property_type
    )
    searcher.find_property(skip_os=True)
    return searcher


def _fetch_epc_records(data):
    """Look up the newest EPC and its metadata for every row of the asset list.

    Rows whose address is in NO_EPC_ADDRESSES are skipped.

    Returns:
        Tuple ``(epc_data, epc_metadata)`` of DataFrames keyed by ``row_id``.

    Raises:
        Exception: if no EPC is found for a row that is not skipped.
        KeyError: if a row's "Property type" is not in PROPERTY_TYPE_MAP.
    """
    epc_rows = []
    metadata_rows = []
    for _, home in tqdm(data.iterrows(), total=len(data)):
        address1, backup = _build_address1(home)
        if address1 in NO_EPC_ADDRESSES:
            continue

        postcode = home["Postcode"]
        property_type = PROPERTY_TYPE_MAP[home["Property type"]]
        searcher = _search_epc(address1, postcode, property_type)
        if searcher.newest_epc is None and backup is not None:
            # Lettered flats are sometimes lodged as "Flat A 80 ..." instead
            searcher = _search_epc(backup, postcode, property_type)
        if searcher.newest_epc is None:
            raise Exception(f"Not found: {address1!r} ({postcode})")

        epc_rows.append({"row_id": home["row_id"], **searcher.newest_epc})
        searcher.get_metadata()
        metadata_rows.append(
            {
                "row_id": home["row_id"],
                "address": address1,
                "postcode": postcode,
                **searcher.metadata
            }
        )
    return pd.DataFrame(epc_rows), pd.DataFrame(metadata_rows)


def _compare_with_epc(data, epc_data):
    """Compare asset-list columns against the EPC data.

    Returns:
        Dict mapping each EPC column in COMPARISON_COLS to a DataFrame of the
        rows that do NOT match (empty when everything matches). Keying by
        column, rather than appending to a list only when mismatches exist,
        keeps every report addressable even when a comparison is clean.

    Raises:
        Exception: if a search-term comparison leaves a row that none of the
            configured search terms applies to.
    """
    differences = {}
    for asset_list_col, list_of_configs in COMPARISON_COLS.items():
        for config in list_of_configs:
            epc_col = config["epc_col"]
            remapped = data[["row_id", asset_list_col]].copy()

            if "search_terms" in config:
                # Normalise the asset value in case of leading/trailing spaces
                remapped[asset_list_col] = remapped[asset_list_col].str.strip().str.lower()
                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
                remapped["Match"] = None
                # Later terms deliberately overwrite earlier ones (dict order).
                for search_term, epc_term in config["search_terms"].items():
                    # Literal containment: terms such as "n/a - (flat above)"
                    # contain regex metacharacters.
                    hits = remapped[asset_list_col].str.contains(search_term, regex=False, na=False)
                    remapped.loc[hits, "Match"] = remapped.loc[hits, epc_col].str.contains(epc_term)
                if remapped["Match"].isnull().any():
                    raise Exception("Not all matched")
                remapped["Match"] = remapped["Match"].astype(bool)
            else:
                if config["map"]:
                    remapped[asset_list_col] = remapped[asset_list_col].map(config["map"])
                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
                remapped["Match"] = remapped[asset_list_col] == remapped[epc_col]

            differences[epc_col] = remapped[~remapped["Match"]]
    return differences


def _print_difference_reports(differences, data, epc_data):
    """Print each difference table with the asset-list address attached."""
    addresses = data[["row_id", *ADDRESS_COLS]]
    uprns = epc_data[["row_id", "uprn"]]

    # Check for property type
    print(differences["property-type"].merge(addresses, on="row_id", how="inner"))

    # Check for built form (flats are excluded from this report)
    built_form_differences = differences["built-form"]
    built_form_differences = built_form_differences[built_form_differences["Property type"] != "Flat"]
    print(built_form_differences.merge(addresses, on="row_id", how="inner"))

    # Energy rating, then wall / roof / floor type
    # (many wall, roof and floor type differences were seen on this portfolio)
    for epc_col in ("current-energy-rating", "walls-description", "roof-description", "floor-description"):
        print(
            differences[epc_col]
            .merge(addresses, on="row_id", how="inner")
            .merge(uprns, on="row_id", how="inner")
        )
    # TODO: 47 Ashtead Road [100021024699] shows solid brick wall on EPC - is probably cavity wall


def _build_spatial_data(data, epc_data):
    """Join planning constraints from the asset list with spatial data per UPRN."""
    portfolio_spatial_data = OpenUprnClient.get_spatial_data(
        epc_data["uprn"].unique().tolist(), bucket_name=DATA_BUCKET
    )
    portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)
    return data[["row_id", "Planning constraints"]].merge(
        epc_data[["row_id", "uprn"]],
        on="row_id",
        how="left",
    ).merge(
        portfolio_spatial_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]],
        left_on="uprn",
        right_on="UPRN",
        how="left"
    )


def _attach_cleaned_descriptions(epc_data):
    """Merge the cleaned wall/roof/floor description flags onto the EPC data.

    The merges are inner joins, so EPC rows whose description is missing from
    the cleaned lookup are dropped.
    """
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=DATA_BUCKET
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)
    for epc_col, columns, renames in CLEANED_DESCRIPTION_MERGES:
        epc_data = epc_data.merge(
            pd.DataFrame(cleaned[epc_col])[columns].rename(columns=renames),
            left_on=epc_col,
            right_on="original_description"
        )
    return epc_data


def _floor_area_band(areas):
    """Bucket floor areas into the FLOOR_AREA_LABELS categories."""
    return pd.cut(areas, bins=FLOOR_AREA_BINS, labels=FLOOR_AREA_LABELS)


def _categorize_year(year):
    """Map a build year (int-like, or a string such as '1930s') to a band A-L."""
    if isinstance(year, str) and 's' in year:
        # Handle the case where year is in the format '1930s'
        year = year[:4]
    return AGE_BAND_LABELS[bisect_right(AGE_BAND_STARTS, int(year))]


def _archetype_digest(values):
    """Return a digest of one archetype row that is stable across runs.

    The builtin hash() is salted per process for strings, which made the
    archetype ordering (and therefore archetype_id) change between runs.
    """
    joined = "\x1f".join(str(value) for value in values)
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()


def app(input_path=DEFAULT_INPUT_PATH, output_dir=DEFAULT_OUTPUT_DIR):
    """Review the AIHA SHDF asset list against EPC data and draft archetypes.

    Steps: read the asset list, find the newest EPC for each property, print
    the differences between the asset list and the EPC data, check planning
    constraints against spatial data, then group properties into archetypes
    and flag one representative property per archetype.

    Args:
        input_path: path to the asset-list workbook (sheet
            "All units information", header on the fourth row).
        output_dir: directory that receives basic-archetypes.csv,
            epc_data.csv, spatial_data.csv and archetyping_data.csv.

    Raises:
        Exception: if an EPC cannot be found, a search-term comparison leaves
            unmatched rows, or the archetyping merge changes the row count.
    """
    # Retrieve EPC data for the SHDF AIHA portfolio
    data = pd.read_excel(
        input_path,
        sheet_name="All units information",
        header=3
    )
    # Remove the e.g. row at the top and the bottom 2 rows
    data = data.tail(-1).head(-2).reset_index(drop=True)
    data["row_id"] = data.index

    epc_data, epc_metadata = _fetch_epc_records(data)

    # Check matched addresses
    matched_addresses = epc_metadata[["row_id", "address", "postcode"]].merge(
        data[["row_id", *ADDRESS_COLS]],
        on="row_id",
        how="inner"
    )
    print(matched_addresses)

    # We look for differences between the asset list and the EPC data
    differences = _compare_with_epc(data, epc_data)
    _print_difference_reports(differences, data, epc_data)

    # We have the EPC data. Let's check conservation area/historic/listed building status
    spatial_data = _build_spatial_data(data, epc_data)
    no_constraints = spatial_data["Planning constraints"] == "None"
    print(spatial_data.loc[no_constraints, "conservation_status"].value_counts())

    # One property is in a conservation area, that was not picked up in the asset data
    print(
        spatial_data[no_constraints & spatial_data["conservation_status"].eq(True)].merge(
            data[["row_id", *ADDRESS_COLS, "Postcode"]],
            on="row_id",
            how="left"
        )
    )
    # All properties match up apart from one where the asset data indicates it's in a conservation area, however
    # the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio

    ################################################################
    # Draft archetyping
    ################################################################
    epc_data = _attach_cleaned_descriptions(epc_data)

    archetyping_data = data[
        [
            "row_id",
            "Energy starting band (EPC)",
            "Property type",
            "Property year built",
            "Gross internal area (sqm)",
            "Current heating system type",
            "Wall type",
            "Floor type",
            "Roof type",
            "Window type",
            "Location (Floor)",
        ]
    ].merge(
        epc_metadata[["row_id", "floor"]],
        how="left",
        on="row_id"
    ).merge(
        epc_data[
            [
                "row_id", "uprn", "current-energy-rating", "property-type", "built-form", "total-floor-area",
                'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick_wall', 'is_system_built_wall',
                'is_timber_frame_wall', 'is_as_built', 'is_assumed_wall', 'insulation_thickness_wall',
                'is_solid', 'is_suspended', 'is_assumed_floor', 'insulation_thickness_floor',
                'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
                'is_assumed_roof', 'has_dwelling_above', 'insulation_thickness',
                "mainheat-description", "local-authority-label"
            ]
        ],
        how="left",
        on="row_id"
    ).merge(
        spatial_data[["row_id", "conservation_status"]],
        on="row_id",
        how="left"
    )

    if archetyping_data.shape[0] != data.shape[0]:
        raise Exception("Mismatch in data")

    # Floor-area band from the asset list, falling back to the EPC floor area.
    floor_area_band = _floor_area_band(archetyping_data["Gross internal area (sqm)"]).fillna(
        _floor_area_band(archetyping_data["total-floor-area"].astype(float))
    ).astype(object)
    # Label missing bands BEFORE casting to str; casting first turns NaN into
    # the string "nan", which no null check can find afterwards.
    archetyping_data["Floor_area_category"] = floor_area_band.where(
        floor_area_band.notna(), "Unknown"
    ).astype(str)

    archetyping_data["property-type-reduced"] = np.where(
        archetyping_data["property-type"].isin(["Flat", "Maisonette"]),
        "Flat/Maisonette",
        archetyping_data["property-type"]
    )
    archetyping_data["built-form-reduced"] = np.where(
        archetyping_data["built-form"].isin(["End-Terrace", "Semi-Detached"]),
        "End-Terrace/Semi-Detached",
        archetyping_data["built-form"]
    )
    archetyping_data["built-form-reduced"] = np.where(
        archetyping_data["property-type-reduced"] == "Flat/Maisonette",
        "Flat/Maisonette",
        archetyping_data["built-form-reduced"]
    )

    # Collapse the asset-list wall type spellings (note the trailing spaces)
    archetyping_data["Wall type"] = np.where(
        archetyping_data["Wall type"].isin(['Solid ', 'Solid - internal lining ']),
        "Solid",
        archetyping_data["Wall type"]
    )
    archetyping_data["Wall type"] = np.where(
        archetyping_data["Wall type"].isin(['Cavity ', 'cavity ']),
        "Cavity",
        archetyping_data["Wall type"]
    )

    # Perform the remaps
    uprn_as_text = archetyping_data["uprn"].astype(str)
    for uprn, config in VALUE_REMAPS.items():
        archetyping_data.loc[uprn_as_text == uprn, config["variable"]] = config["newvalue"]

    # Debug snippet for finding the UPRN of an asset-list row:
    # row_id = data[
    #     # (data["Address letter or number"] == "C") &
    #     (data["Street address"].str.strip() == "41 Moresby Road")
    # ]["row_id"]
    # if len(row_id) != 1:
    #     raise Exception("Fail")
    # print(epc_data[epc_data["row_id"] == row_id.values[0]]["uprn"])

    # Map the year to the age band
    archetyping_data["SAP_age_band"] = archetyping_data["Property year built"].apply(
        _categorize_year
    )

    # Flag if the property is in London/Manchester
    # Hackney 73 - London
    # Southend-on-Sea 6 - Southend
    # Barnet 4 - London
    # Castle Point 4 - Southend
    # Haringey 3 - London
    # Salford 2 - Manchester
    # Bury 1 - Manchester
    local_authority = archetyping_data["local-authority-label"]
    archetyping_data["Location"] = np.select(
        [
            local_authority.isin(["Hackney", "Barnet", "Haringey"]),
            local_authority.isin(["Salford", "Bury"]),
        ],
        ["London", "Manchester"],
        default="Southend"
    )

    # 9 Greenview is in manchester (it has no EPC, so no local authority label)
    greenview_row_id = data[data["Street address"] == "9 Greenview"]["row_id"].values[0]
    archetyping_data.loc[archetyping_data["row_id"] == greenview_row_id, "Location"] = "Manchester"

    # We fix the location for B 80 Bethune Road
    bethune_b_row_ids = data.loc[
        (data["Street address"] == "80 Bethune Road") & (data["Address letter or number"] == "B"),
        "row_id"
    ]
    archetyping_data.loc[archetyping_data["row_id"].isin(bethune_b_row_ids), "Location"] = "London"

    archetypes = archetyping_data[PRIMARY_ARCHETYPING_COLS].drop_duplicates().copy()

    # Hash the variables
    archetypes["archetype_hash"] = [
        _archetype_digest(row) for row in archetypes.itertuples(index=False, name=None)
    ]
    archetypes = archetypes.sort_values("archetype_hash", ascending=True)
    archetypes = archetypes.reset_index(drop=True)
    archetypes["archetype_id"] = archetypes.index

    archetypes.to_csv(os.path.join(output_dir, "basic-archetypes.csv"), index=False)

    # We match properties to archetypes
    archetyping_data = archetyping_data.merge(
        archetypes, on=PRIMARY_ARCHETYPING_COLS, how="left"
    )

    # We should choose a representative property for each archetype
    archetyping_data = archetyping_data.merge(
        epc_metadata[["row_id", "days_since_last_epc"]],
        how="left",
        on="row_id"
    )

    # Mark the property with the oldest EPC as the representative property
    representative_properties = archetyping_data.sort_values(
        ["archetype_id", "days_since_last_epc"], ascending=[True, False]
    ).drop_duplicates("archetype_id")
    archetyping_data["for_sample"] = archetyping_data["row_id"].isin(representative_properties["row_id"])

    # Save the EPC data
    epc_data.to_csv(os.path.join(output_dir, "epc_data.csv"), index=False)

    # Save the spatial data
    address_and_postcode = data[["row_id", *ADDRESS_COLS, "Postcode"]]
    spatial_data = address_and_postcode.merge(spatial_data, on="row_id", how="left")
    spatial_data.to_csv(os.path.join(output_dir, "spatial_data.csv"), index=False)

    # Save archetyping data (written once, with the address columns attached;
    # the earlier address-less write to the same file was always overwritten)
    archetyping_data = address_and_postcode.merge(archetyping_data, on="row_id", how="left")
    archetyping_data.to_csv(os.path.join(output_dir, "archetyping_data.csv"), index=False)