Added string similarity to filter addresses

This commit is contained in:
Khalim Conn-Kowlessar 2023-12-15 15:46:52 +00:00
parent 7f8c185bca
commit 01d8e52650
3 changed files with 56 additions and 10 deletions

View file

@ -3,6 +3,7 @@ import time
from epc_api.client import EpcClient
from utils.logger import setup_logger
from typing import List
from fuzzywuzzy import process
logger = setup_logger()
@ -108,7 +109,45 @@ class SearchEpc:
"error": str(e)
}
def retrieve(self, property_type=None):
@staticmethod
def filter_rows(rows, property_type=None, address=None):
"""
This method should not be used when property_type and address are both not None
:param rows:
:param property_type:
:param address:
:return:
"""
# Given the results from the EPC api, attempts to reduce the number of rows
uprns = {r["uprn"] for r in rows}
if (property_type is None) and (address is None):
return rows
if len(uprns) == 1:
return rows
logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
if property_type is not None:
# We can do a filter on the property type
rows_filtered = [r for r in rows if r["property-type"] == property_type]
if rows_filtered:
return rows_filtered
return rows
if address is not None:
# We can do a filter on the property type
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
rows_filtered = [r for r in rows if r["address"] == best_match[0]]
if rows_filtered:
return rows_filtered
return rows
def retrieve(self, property_type=None, address=None):
"""
Given a successful search, this method will format the data and return it
@ -123,15 +162,9 @@ class SearchEpc:
# We perform some checks on the rows
# Firstly, we should only have 1 uprn so if we have multiple, we'll need to filter down the
# property further
uprns = {r["uprn"] for r in rows}
if len(uprns) != 1:
logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
if property_type is not None:
# We can do a filter on the property type
rows_filtered = [r for r in rows if r["property-type"] == property_type]
if rows_filtered:
rows = rows_filtered
rows = self.filter_rows(rows, property_type=property_type, address=None)
rows = self.filter_rows(rows, property_type=None, address=address)
# We now check for a full sap epc:
full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]

View file

@ -47,6 +47,8 @@ def load_ha_33():
def standardise_ha33(data):
data = data[~pd.isnull(data["ADDRESS"])]
split_addresses = data['ADDRESS'].str.split(',', expand=True)
split_addresses.columns = ['address1', 'address2', 'address3', 'address4', 'address5']
@ -103,7 +105,8 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
continue
newest_epc, older_epcs, _ = searcher.retrieve(
property_type=house_type_lookup.get(house["PROPERTY TYPE"], None)
property_type=house_type_lookup.get(house["PROPERTY TYPE"], None),
address=house["ADDRESS"],
)
eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
@ -145,6 +148,14 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
}
)
# import pickle
# with open("ha33_results.pickle", "wb") as f:
# pickle.dump({
# "results": results,
# "scoring_data": scoring_data,
# "nodata": nodata
# }, f)
return results, scoring_data, nodata

View file

@ -7,3 +7,5 @@ python-dotenv
boto3
textblob
pyarrow==12.0.1
fuzzywuzzy
python-Levenshtein