mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added string similarity to filter addresses
This commit is contained in:
parent
7f8c185bca
commit
01d8e52650
3 changed files with 56 additions and 10 deletions
|
|
@ -3,6 +3,7 @@ import time
|
|||
from epc_api.client import EpcClient
|
||||
from utils.logger import setup_logger
|
||||
from typing import List
|
||||
from fuzzywuzzy import process
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
|
@ -108,7 +109,45 @@ class SearchEpc:
|
|||
"error": str(e)
|
||||
}
|
||||
|
||||
def retrieve(self, property_type=None):
|
||||
@staticmethod
|
||||
def filter_rows(rows, property_type=None, address=None):
|
||||
"""
|
||||
This method should not be used when property_type and address are both not None
|
||||
:param rows:
|
||||
:param property_type:
|
||||
:param address:
|
||||
:return:
|
||||
"""
|
||||
# Given the results from the EPC api, attempts to reduce the number of rows
|
||||
uprns = {r["uprn"] for r in rows}
|
||||
|
||||
if (property_type is None) and (address is None):
|
||||
return rows
|
||||
|
||||
if len(uprns) == 1:
|
||||
return rows
|
||||
|
||||
logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
|
||||
if property_type is not None:
|
||||
# We can do a filter on the property type
|
||||
rows_filtered = [r for r in rows if r["property-type"] == property_type]
|
||||
|
||||
if rows_filtered:
|
||||
return rows_filtered
|
||||
|
||||
return rows
|
||||
|
||||
if address is not None:
|
||||
# We can do a filter on the property type
|
||||
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
|
||||
rows_filtered = [r for r in rows if r["address"] == best_match[0]]
|
||||
|
||||
if rows_filtered:
|
||||
return rows_filtered
|
||||
|
||||
return rows
|
||||
|
||||
def retrieve(self, property_type=None, address=None):
|
||||
|
||||
"""
|
||||
Given a successful search, this method will format the data and return it
|
||||
|
|
@ -123,15 +162,9 @@ class SearchEpc:
|
|||
# We perform some checks on the rows
|
||||
# Firstly, we should only have 1 uprn so if we have multiple, we'll need to filter down the
|
||||
# property further
|
||||
uprns = {r["uprn"] for r in rows}
|
||||
|
||||
if len(uprns) != 1:
|
||||
logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
|
||||
if property_type is not None:
|
||||
# We can do a filter on the property type
|
||||
rows_filtered = [r for r in rows if r["property-type"] == property_type]
|
||||
if rows_filtered:
|
||||
rows = rows_filtered
|
||||
rows = self.filter_rows(rows, property_type=property_type, address=None)
|
||||
rows = self.filter_rows(rows, property_type=None, address=address)
|
||||
|
||||
# We now check for a full sap epc:
|
||||
full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@ def load_ha_33():
|
|||
|
||||
|
||||
def standardise_ha33(data):
|
||||
data = data[~pd.isnull(data["ADDRESS"])]
|
||||
|
||||
split_addresses = data['ADDRESS'].str.split(',', expand=True)
|
||||
split_addresses.columns = ['address1', 'address2', 'address3', 'address4', 'address5']
|
||||
|
||||
|
|
@ -103,7 +105,8 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
|
|||
continue
|
||||
|
||||
newest_epc, older_epcs, _ = searcher.retrieve(
|
||||
property_type=house_type_lookup.get(house["PROPERTY TYPE"], None)
|
||||
property_type=house_type_lookup.get(house["PROPERTY TYPE"], None),
|
||||
address=house["ADDRESS"],
|
||||
)
|
||||
|
||||
eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
|
||||
|
|
@ -145,6 +148,14 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
|
|||
}
|
||||
)
|
||||
|
||||
# import pickle
|
||||
# with open("ha33_results.pickle", "wb") as f:
|
||||
# pickle.dump({
|
||||
# "results": results,
|
||||
# "scoring_data": scoring_data,
|
||||
# "nodata": nodata
|
||||
# }, f)
|
||||
|
||||
return results, scoring_data, nodata
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -7,3 +7,5 @@ python-dotenv
|
|||
boto3
|
||||
textblob
|
||||
pyarrow==12.0.1
|
||||
fuzzywuzzy
|
||||
python-Levenshtein
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue