improving SearchEpc matching algorithm

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-27 14:43:18 +01:00
parent f122ae3269
commit 4c71342cfb
2 changed files with 257 additions and 23 deletions

View file

@ -292,8 +292,7 @@ class SearchEpc:
"error": str(e)
}
@staticmethod
def filter_rows(rows, property_type=None, address=None):
def filter_rows(self, rows, property_type=None, address=None):
"""
This method should not be used when property_type and address are both not None
:param rows:
@ -321,7 +320,18 @@ class SearchEpc:
if address is not None:
# We can do a filter on the property type
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
# We check if the full address contains the postcode and, if it does, remove it
if self.postcode in address:
address = address.replace(self.postcode, "").strip().rstrip(",")
# We check if post town is included in the address
if any([r["posttown"].lower() in address.lower() for r in rows]):
best_match = process.extractOne(
address, [", ".join([r["address"], r["posttown"]]) for r in rows], score_cutoff=0
)
else:
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
# Get all of the scores
rows_filtered = [r for r in rows if r["address"] == best_match[0]]
if rows_filtered:

View file

@ -1,4 +1,24 @@
import pandas as pd
import numpy as np
from backend.SearchEpc import SearchEpc
from dotenv import load_dotenv
from tqdm import tqdm
import os
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def clean_colnames(df):
    """
    Merge a two-row spreadsheet header into a single row of column names.

    The first row of ``df`` (by position) is treated as a secondary header. Where it holds a non-empty value
    for a column, the new column name is ``"<primary>+<secondary>"``; where it is null or empty, the primary
    column name is kept unchanged. The secondary-header row is then removed from the result.

    :param df: DataFrame whose first row holds the secondary header values
    :return: a new DataFrame without the secondary-header row and with the merged column names; the frame
        passed in is left untouched
    """
    # Nothing to merge when there is no secondary-header row at all
    if len(df) == 0:
        return df.copy()

    secondary_cols = ["" if pd.isnull(x) else x for x in df.iloc[0, :].values]
    new_colnames = [
        # f-string rather than str.join so a non-string secondary header (e.g. a number) does not raise
        f"{primary}+{secondary}" if secondary else primary
        for primary, secondary in zip(df.columns, secondary_cols)
    ]
    # Remove the header row by position, matching how it was read above. Dropping by the label 0 fails
    # whenever the index is not the default 0..n-1 range.
    df = df.iloc[1:].copy()
    df.columns = new_colnames
    return df
def main():
@ -8,33 +28,237 @@ def main():
:return:
"""
all_locations = [
"Forest Road Erith",
"Lesney Farms",
"Brook Street 155 - 243",
"Hazel Drive",
"Page Crescent",
"Brook Salmon Roberts and Chapma",
"Beacon Road"
]
all_assets = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - "
"reduced.xlsx",
sheet_name="Full Property List",
header=1
)
secondary_cols = ["" if pd.isnull(x) else x for x in all_assets.iloc[0, :].values]
new_colnames = [
"+".join([all_assets.columns[i], secondary_cols[i]]) if secondary_cols[i] else all_assets.columns[i]
for i, c in enumerate(all_assets.columns)
]
# Drop row 0
all_assets = clean_colnames(all_assets)
all_assets["Location"] = None
locations = {
location_name: pd.read_excel(
location_name: clean_colnames(pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - "
"reduced"
".xlsx",
sheet_name=location_name
) for location_name in [
"Forest Road Erith",
"Lesney Farms",
"Brook Street 155 - 243",
"Hazel Drive",
"Page Crescent",
"Brook Salmon Roberts and Chapma",
"Beacon Road"
]
"reduced.xlsx",
sheet_name=location_name,
header=1
)) for location_name in all_locations
}
for loc in all_locations:
all_assets["Location"] = np.where(
all_assets["Asset Reference"].isin(locations[loc]["Asset Reference"]),
loc,
all_assets["Location"]
)
if pd.isnull(all_assets["Location"]).sum():
raise Exception("something went wrong")
# 234 properties below EPC C
below_epc_c = all_assets[all_assets["PRE CALCULATED EPC"].isin(["D", "E", "F", "G"])].copy()
# We simplify wall type
below_epc_c["wall_type_simplified"] = below_epc_c["Wall Type"].str.split(" ").str[0]
known_no_epc = [
28679, # These is no EPC for 11 Page Crescent, Erith, Kent, DA8 2HJ, just 11A
29291, # No EPC for 225 Slade Green Road, Erith, Kent, DA8 2JW
]
# Get the EPC data
epc_data = []
for _, home in tqdm(all_assets.iterrows(), total=len(all_assets)):
if home["Asset Reference"] in known_no_epc:
continue
address = home["Address"]
# Spelling error
if "Frinstead" in address:
address = address.replace("Frinstead", "Frinsted")
address1 = address.split(",")[0]
searcher = SearchEpc(
address1=address1,
postcode=home["Address - Postcode"],
auth_token=EPC_AUTH_TOKEN,
os_api_key="",
full_address=address,
)
searcher.ordnance_survey_client.property_type = None
searcher.ordnance_survey_client.built_form = None
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
raise Exception("Couldn't find")
epc_data.append(
{
"Asset Reference": home["Asset Reference"],
**searcher.newest_epc.copy()
}
)
epc_data = pd.DataFrame(epc_data)
# epc_data.to_csv(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley EPC data.csv", index=False
# )
epc_comparison = all_assets[
['Asset Reference', 'Address', 'PRE CALCULATED EPC']
].merge(
epc_data[["Asset Reference", "current-energy-rating", "lodgement-date"]],
on='Asset Reference',
how="left"
)
# There is a large number of properties (147) whose pre-calculated EPC rating differs from what's on the registry.
# These may be internally held EPRs, but this may inform which properties we might want to prioritise for survey
different_epcs = epc_comparison[
epc_comparison["PRE CALCULATED EPC"] != epc_comparison["current-energy-rating"]
]
not_c = different_epcs[
(different_epcs["PRE CALCULATED EPC"] == "C") &
(different_epcs["current-energy-rating"] != "C")
]
system_builds = below_epc_c[
below_epc_c["Wall Type"].str.contains("SystemBuilt")
].copy()
combinations = system_builds[
['Asset Type', 'Property Type', 'Location', 'PRE CALCULATED EPC', 'Wall Type', ]
].drop_duplicates()
system_build_data_comparison = system_builds.merge(
epc_data[["Asset Reference", "walls-description", "roof-description", "current-energy-rating"]],
left_on='Asset Reference',
right_on='Asset Reference',
how="left"
)
system_build_data_comparison["PRE CALCULATED EPC"].value_counts()
system_build_data_comparison["current-energy-rating"].value_counts()
epc_cs_system_builds = system_build_data_comparison[system_build_data_comparison["current-energy-rating"] == "C"]
archetype_columns = [
["Asset Type", "Property Type", "Wall Type", "Location"],
["Asset Type", "Property Type", "Location"],
["Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description"],
["Asset Type", "Property Type", "Location", "PRE CALCULATED EPC"]
]
summary = []
for cols in archetype_columns:
combinations = system_build_data_comparison[cols].drop_duplicates()
summary.append(
{
"cols": cols,
"number_archetypes": len(combinations),
}
)
summary = pd.DataFrame(summary)
# Let's use this column combination
chosen_combination = [
"Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description"
]
# For this combination, let's find the properties
archetype_combinations = system_build_data_comparison[chosen_combination].drop_duplicates().reset_index(drop=True)
archetype_combinations["archetype ID"] = archetype_combinations.index
archetyped_data = system_build_data_comparison.merge(
archetype_combinations, how="left", on=chosen_combination
)
counts = archetyped_data["archetype ID"].value_counts()
# Archetype 0: Semi D, Uninsulated system built, Pre calculated EPC D, flat insulated roof, (Lesney-0)
# Archetype 1: Semi D, Externally insulated system built, Pre calculated EPC D, flat insulated roof (Lesney-1)
# Archetype 5: Semi D, System built with unknown insulation, Pre calculated EPC D, flat roof insulated (Lesney-2)
# Archetype 3: Semi D, Externally insulated system built, Pre calculated EPC D, flat roof uninsulated (assumed) (
# Lesney-3)
# 0 21
# 1 10
# 5 10
# 3 3
# 2 1
# 4 1
# 6 1
# 7 1
# 8 1
# 9 1
# 10 1
# 11 1
# This archetype is the same as 0, apart from the pre-calculated EPC being an E. The registry says this is a D
# This has been added to additional units
eg1 = archetyped_data[archetyped_data["archetype ID"] == 2]
# This archetype is the same as 3, apart from it having limited flat roof insulation.
# TODO: The insulation status of this property should be confirmed
eg2 = archetyped_data[archetyped_data["archetype ID"] == 4]
eg2["roof-description"]
z = epc_data[epc_data["Asset Reference"] == eg2["Asset Reference"].values[0]]
# This is the one mid-terrace - the EPC data indicates that this is Semi-detached
# Otherwise this is archetype 5
# this should be semi-detached
eg3 = archetyped_data[archetyped_data["archetype ID"] == 6]
eg3_epc_data = epc_data[epc_data["Asset Reference"] == eg3["Asset Reference"].values[0]]
# This warrants its own archetype
# Semi D, System built with unknown insulation, Pre calculated EPC D, flat uninsulated roof
eg4 = archetyped_data[archetyped_data["archetype ID"] == 7]
# This property stands out due to the mixed cavity and system built wall, but besides that it's similar to
# archetype 0
# The latest EPC agrees that this is a mixed wall type but the EPC suggests solid and cavity, with an assumed
# insulated cavity, as built
eg5 = archetyped_data[archetyped_data["archetype ID"] == 8]
# Archetypes 9, 10, 11 are all similar, Semi D, Uninsulated system built, with pitched lofts with up to 200mm
# insulation in the lofts
eg6 = archetyped_data[archetyped_data["archetype ID"] == 9]
# It's just the three units
# They're all labelled as
pitched_system_built_properties = archetyped_data[archetyped_data["archetype ID"].isin([9, 10, 11])]
pitched_system_built_properties["Address"]
notes = [
{
"Asset Reference": 27445,
"note": "Confirmed this has a pitched roof on Maps"
},
{
"Asset Reference": 27443,
"note": "Confirmed this has a pitched roof on Maps"
},
{
"Asset Reference": 27442,
"note": "Confirmed this has a pitched roof on Maps"
},
{
"Asset Reference": 25847,
"note": "This is labelled as a mid-terrace but the EPC data + Maps suggest it's a semi-detached"
}
]
patches = {
25847: {"Property Type": "Semi Detached House", "archetype ID": 5},
}