diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 37c2b7f9..fd6ea032 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -292,8 +292,7 @@ class SearchEpc: "error": str(e) } - @staticmethod - def filter_rows(rows, property_type=None, address=None): + def filter_rows(self, rows, property_type=None, address=None): """ This method should not be used when property_type and address are both not None :param rows: @@ -321,7 +320,18 @@ class SearchEpc: if address is not None: # We can do a filter on the property type - best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) + # We check if the full address contains the postcode and if it does, remove + if self.postcode in address: + address = address.replace(self.postcode, "").strip().rstrip(",") + + # We check if post town is included in the address + if any([r["posttown"].lower() in address.lower() for r in rows]): + best_match = process.extractOne( + address, [", ".join([r["address"], r["posttown"]]) for r in rows], score_cutoff=0 + ) + else: + best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) + # Get all of the scores rows_filtered = [r for r in rows if r["address"] == best_match[0]] if rows_filtered: diff --git a/etl/customers/orbit/archetypes.py b/etl/customers/orbit/archetypes.py index 2a2e0baf..e0f5e995 100644 --- a/etl/customers/orbit/archetypes.py +++ b/etl/customers/orbit/archetypes.py @@ -1,4 +1,24 @@ import pandas as pd +import numpy as np +from backend.SearchEpc import SearchEpc +from dotenv import load_dotenv +from tqdm import tqdm +import os + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def clean_colnames(df): + secondary_cols = ["" if pd.isnull(x) else x for x in df.iloc[0, :].values] + new_colnames = [ + "+".join([df.columns[i], secondary_cols[i]]) if secondary_cols[i] else df.columns[i] + for i, c in enumerate(df.columns) + ] + # Drop row 0 + df = df.drop(0) + 
df.columns = new_colnames + return df def main(): @@ -8,33 +28,237 @@ def main(): :return: """ + all_locations = [ + "Forest Road Erith", + "Lesney Farms", + "Brook Street 155 - 243", + "Hazel Drive", + "Page Crescent", + "Brook Salmon Roberts and Chapma", + "Beacon Road" + ] + all_assets = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - " "reduced.xlsx", sheet_name="Full Property List", header=1 ) - - secondary_cols = ["" if pd.isnull(x) else x for x in all_assets.iloc[0, :].values] - new_colnames = [ - "+".join([all_assets.columns[i], secondary_cols[i]]) if secondary_cols[i] else all_assets.columns[i] - for i, c in enumerate(all_assets.columns) - ] - # Drop row 0 + all_assets = clean_colnames(all_assets) + all_assets["Location"] = None locations = { - location_name: pd.read_excel( + location_name: clean_colnames(pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - " - "reduced" - ".xlsx", - sheet_name=location_name - ) for location_name in [ - "Forest Road Erith", - "Lesney Farms", - "Brook Street 155 - 243", - "Hazel Drive", - "Page Crescent", - "Brook Salmon Roberts and Chapma", - "Beacon Road" - ] + "reduced.xlsx", + sheet_name=location_name, + header=1 + )) for location_name in all_locations + } + + for loc in all_locations: + all_assets["Location"] = np.where( + all_assets["Asset Reference"].isin(locations[loc]["Asset Reference"]), + loc, + all_assets["Location"] + ) + + if pd.isnull(all_assets["Location"]).sum(): + raise Exception("something went wrong") + + # 234 properties below EPC C + below_epc_c = all_assets[all_assets["PRE CALCULATED EPC"].isin(["D", "E", "F", "G"])].copy() + + # We simplify wall type + below_epc_c["wall_type_simplified"] = below_epc_c["Wall Type"].str.split(" ").str[0] + + known_no_epc = [ + 28679, # There is no EPC for 11 Page Crescent, Erith, Kent, DA8 2HJ, just 11A + 29291, # No EPC for 225 
Slade Green Road, Erith, Kent, DA8 2JW + ] + # Get the EPC data + epc_data = [] + for _, home in tqdm(all_assets.iterrows(), total=len(all_assets)): + if home["Asset Reference"] in known_no_epc: + continue + + address = home["Address"] + # Spelling error + if "Frinstead" in address: + address = address.replace("Frinstead", "Frinsted") + + address1 = address.split(",")[0] + + searcher = SearchEpc( + address1=address1, + postcode=home["Address - Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=address, + ) + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + raise Exception("Couldn't find") + + epc_data.append( + { + "Asset Reference": home["Asset Reference"], + **searcher.newest_epc.copy() + } + ) + + epc_data = pd.DataFrame(epc_data) + # epc_data.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley EPC data.csv", index=False + # ) + + epc_comparison = all_assets[ + ['Asset Reference', 'Address', 'PRE CALCULATED EPC'] + ].merge( + epc_data[["Asset Reference", "current-energy-rating", "lodgement-date"]], + on='Asset Reference', + how="left" + ) + + # There are a large # of properties (147) that have different pre calculated EPC rating, to what's on the registry + # These may be internally held EPRs but this may inform which properties we might want to prioritise for survey + different_epcs = epc_comparison[ + epc_comparison["PRE CALCULATED EPC"] != epc_comparison["current-energy-rating"] + ] + + not_c = different_epcs[ + (different_epcs["PRE CALCULATED EPC"] == "C") & + (different_epcs["current-energy-rating"] != "C") + ] + + system_builds = below_epc_c[ + below_epc_c["Wall Type"].str.contains("SystemBuilt") + ].copy() + + combinations = system_builds[ + ['Asset Type', 'Property Type', 'Location', 'PRE CALCULATED EPC', 'Wall Type', ] + ].drop_duplicates() + + 
system_build_data_comparison = system_builds.merge( + epc_data[["Asset Reference", "walls-description", "roof-description", "current-energy-rating"]], + left_on='Asset Reference', + right_on='Asset Reference', + how="left" + ) + + system_build_data_comparison["PRE CALCULATED EPC"].value_counts() + system_build_data_comparison["current-energy-rating"].value_counts() + + epc_cs_system_builds = system_build_data_comparison[system_build_data_comparison["current-energy-rating"] == "C"] + + archetype_columns = [ + ["Asset Type", "Property Type", "Wall Type", "Location"], + ["Asset Type", "Property Type", "Location"], + ["Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description"], + ["Asset Type", "Property Type", "Location", "PRE CALCULATED EPC"] + ] + + summary = [] + for cols in archetype_columns: + combinations = system_build_data_comparison[cols].drop_duplicates() + summary.append( + { + "cols": cols, + "number_archetypes": len(combinations), + } + ) + + summary = pd.DataFrame(summary) + + # Let's use this column combination + chosen_combination = [ + "Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description" + ] + + # For this combination, let's find the properties + archetype_combinations = system_build_data_comparison[chosen_combination].drop_duplicates().reset_index(drop=True) + archetype_combinations["archetype ID"] = archetype_combinations.index + + archetyped_data = system_build_data_comparison.merge( + archetype_combinations, how="left", on=chosen_combination + ) + + counts = archetyped_data["archetype ID"].value_counts() + # Archetype 0: Semi D, Uninsulated system built, Pre calculated EPC D, flat insulated roof, (Lesney-0) + # Archetype 1: Semi D, Externally insulated system built, Pre calculated EPC D, flat insulated roof (Lesney-1) + # Archetype 5: Semi D, System built with unknown insulation, Pre calculated EPC D, flat roof insulated (Lesney-2) + # Archetype 3: Semi D, Externally 
insulated system built, Pre calculated EPC D, flat roof uninsulated (assumed) ( + # Lesney-3) + # 0 21 + # 1 10 + # 5 10 + # 3 3 + # 2 1 + # 4 1 + # 6 1 + # 7 1 + # 8 1 + # 9 1 + # 10 1 + # 11 1 + + # This archetype is the same as 0, apart from the pre calculated EPC being an E. The registry says this is a D + # This has been added to additional units + eg1 = archetyped_data[archetyped_data["archetype ID"] == 2] + + # This archetype is the same as 3, apart from it having limited flat roof insulation. + # TODO: The insulation status of this property should be confirmed + eg2 = archetyped_data[archetyped_data["archetype ID"] == 4] + eg2["roof-description"] + z = epc_data[epc_data["Asset Reference"] == eg2["Asset Reference"].values[0]] + + # This is the one mid-terrace - the EPC data indicates that this is Semi-detached + # Otherwise this is archetype 5 + # this should be semi-detached + eg3 = archetyped_data[archetyped_data["archetype ID"] == 6] + eg3_epc_data = epc_data[epc_data["Asset Reference"] == eg3["Asset Reference"].values[0]] + + # This warrants its own archetype + # Semi D, System built with unknown insulation, Pre calculated EPC D, flat uninsulated roof + eg4 = archetyped_data[archetyped_data["archetype ID"] == 7] + + # This property stands out due to the mixed cavity and system built wall, but besides that it's similar to + # archetype 0 + # The latest EPC agrees that this is a mixed wall type but the EPC suggests solid and cavity, with an assumed + # insulated cavity, as built + eg5 = archetyped_data[archetyped_data["archetype ID"] == 8] + + # Archetypes 9, 10, 11 are all similar, Semi D, Uninsulated system built, with pitched lofts with up to 200mm + # insulation in the lofts + eg6 = archetyped_data[archetyped_data["archetype ID"] == 9] + + # It's just the three units + # They're all labelled as + pitched_system_built_properties = archetyped_data[archetyped_data["archetype ID"].isin([9, 10, 11])] + pitched_system_built_properties["Address"] + + notes = [ 
+ { + "Asset Reference": 27445, + "note": "Confirmed this has a pitched roof on Maps" + }, + { + "Asset Reference": 27443, + "note": "Confirmed this has a pitched roof on Maps" + }, + { + "Asset Reference": 27442, + "note": "Confirmed this has a pitched roof on Maps" + }, + { + "Asset Reference": 25847, + "note": "This is labelled as a mid-terrace but the EPC data + Maps suggest it's a semi-detached" + } + ] + + patches = { + 25847: {"Property Type": "Semi Detached House", "archetype ID": 5}, }