From 01d8e526508f815181c1548ded3bdae20e5d6b41 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Dec 2023 15:46:52 +0000 Subject: [PATCH] Added string similarity to filter addresses --- backend/SearchEpc.py | 51 +++++++++++++++++++---- etl/eligibility/ha_15_32/ha33_app.py | 13 +++++- etl/eligibility/ha_15_32/requirements.txt | 2 + 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index bff204c5..16c2a8c8 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -3,6 +3,7 @@ import time from epc_api.client import EpcClient from utils.logger import setup_logger from typing import List +from fuzzywuzzy import process logger = setup_logger() @@ -108,7 +109,45 @@ class SearchEpc: "error": str(e) } - def retrieve(self, property_type=None): + @staticmethod + def filter_rows(rows, property_type=None, address=None): + """ + This method should not be used when property_type and address are both not None + :param rows: + :param property_type: + :param address: + :return: + """ + # Given the results from the EPC api, attempts to reduce the number of rows + uprns = {r["uprn"] for r in rows} + + if (property_type is None) and (address is None): + return rows + + if len(uprns) == 1: + return rows + + logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO") + if property_type is not None: + # We can do a filter on the property type + rows_filtered = [r for r in rows if r["property-type"] == property_type] + + if rows_filtered: + return rows_filtered + + return rows + + if address is not None: + # We can do a filter on the property type + best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) + rows_filtered = [r for r in rows if r["address"] == best_match[0]] + + if rows_filtered: + return rows_filtered + + return rows + + def retrieve(self, property_type=None, address=None): """ Given a successful search, this method will format the data and return it @@ -123,15 +162,9 @@ class SearchEpc: # We perform some checks on the rows # Firstly, we should only have 1 urpn so if we have multiple, we'll need to filter down the # property further - uprns = {r["uprn"] for r in rows} - if len(uprns) != 1: - logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO") - if property_type is not None: - # We can do a filter on the property type - rows_filtered = [r for r in rows if r["property-type"] == property_type] - if rows_filtered: - rows = rows_filtered + rows = self.filter_rows(rows, property_type=property_type, address=None) + rows = self.filter_rows(rows, property_type=None, address=address) # We now check for a full sap epc: full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"] diff --git a/etl/eligibility/ha_15_32/ha33_app.py b/etl/eligibility/ha_15_32/ha33_app.py index 41e6ca3f..1f8c15df 100644 --- a/etl/eligibility/ha_15_32/ha33_app.py +++ b/etl/eligibility/ha_15_32/ha33_app.py @@ -47,6 +47,8 @@ def load_ha_33(): def standardise_ha33(data): + data = data[~pd.isnull(data["ADDRESS"])] + split_addresses = data['ADDRESS'].str.split(',', expand=True) split_addresses.columns = ['address1', 'address2', 'address3', 'address4', 'address5'] @@ -103,7 +105,8 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at): continue newest_epc, older_epcs, _ = searcher.retrieve( - property_type=house_type_lookup.get(house["PROPERTY TYPE"], None) + property_type=house_type_lookup.get(house["PROPERTY TYPE"], None), + address=house["ADDRESS"], ) eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) @@ -145,6 +148,14 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at): } ) + # import pickle + # with open("ha33_results.pickle", "wb") as f: + # pickle.dump({ + # "results": results, + # "scoring_data": scoring_data, + # "nodata": nodata + # }, f) + return results, scoring_data, nodata diff --git a/etl/eligibility/ha_15_32/requirements.txt b/etl/eligibility/ha_15_32/requirements.txt index 74fcd97f..99cc8e93 100644 --- a/etl/eligibility/ha_15_32/requirements.txt +++ b/etl/eligibility/ha_15_32/requirements.txt @@ -7,3 +7,5 @@ python-dotenv boto3 textblob pyarrow==12.0.1 +fuzzywuzzy +python-Levenshtein