Added string similarity to filter addresses

2026-07-27 23:35:01 +00:00 · 2023-12-15 15:46:52 +00:00 · 2023-12-15 15:46:52 +00:00 · 01d8e52650
commit 01d8e52650
parent 7f8c185bca
3 changed files with 56 additions and 10 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -3,6 +3,7 @@ import time
 from epc_api.client import EpcClient
 from utils.logger import setup_logger
 from typing import List
+from fuzzywuzzy import process

 logger = setup_logger()

@ -108,7 +109,45 @@ class SearchEpc:
                        "error": str(e)
                    }

-    def retrieve(self, property_type=None):
+    @staticmethod
+    def filter_rows(rows, property_type=None, address=None):
+        """
+        Pass either property_type or address, not both: if both are given, address is ignored.
+        :param rows:
+        :param property_type:
+        :param address:
+        :return:
+        """
+        # Given the results from the EPC api, attempts to reduce the number of rows
+        uprns = {r["uprn"] for r in rows}
+
+        if (property_type is None) and (address is None):
+            return rows
+
+        if len(uprns) == 1:
+            return rows
+
+        logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
+        if property_type is not None:
+            # We can do a filter on the property type
+            rows_filtered = [r for r in rows if r["property-type"] == property_type]
+
+            if rows_filtered:
+                return rows_filtered
+
+            return rows
+
+        if address is not None:
+            # We can keep only the rows whose address is the closest fuzzy match
+            best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
+            rows_filtered = [r for r in rows if r["address"] == best_match[0]]
+
+            if rows_filtered:
+                return rows_filtered
+
+            return rows
+
+    def retrieve(self, property_type=None, address=None):

        """
        Given a successful search, this method will format the data and return it
@ -123,15 +162,9 @@ class SearchEpc:
        # We perform some checks on the rows
        # Firstly, we should only have 1 urpn so if we have multiple, we'll need to filter down the
        # property further
-        uprns = {r["uprn"] for r in rows}

-        if len(uprns) != 1:
-            logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
-            if property_type is not None:
-                # We can do a filter on the property type
-                rows_filtered = [r for r in rows if r["property-type"] == property_type]
-                if rows_filtered:
-                    rows = rows_filtered
+        rows = self.filter_rows(rows, property_type=property_type, address=None)
+        rows = self.filter_rows(rows, property_type=None, address=address)

        # We now check for a full sap epc:
        full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]
--- a/etl/eligibility/ha_15_32/ha33_app.py
+++ b/etl/eligibility/ha_15_32/ha33_app.py
@ -47,6 +47,8 @@ def load_ha_33():


 def standardise_ha33(data):
+    data = data[~pd.isnull(data["ADDRESS"])]
+
    split_addresses = data['ADDRESS'].str.split(',', expand=True)
    split_addresses.columns = ['address1', 'address2', 'address3', 'address4', 'address5']

@ -103,7 +105,8 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
-            property_type=house_type_lookup.get(house["PROPERTY TYPE"], None)
+            property_type=house_type_lookup.get(house["PROPERTY TYPE"], None),
+            address=house["ADDRESS"],
        )

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
@ -145,6 +148,14 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
            }
        )

+    # import pickle
+    # with open("ha33_results.pickle", "wb") as f:
+    #     pickle.dump({
+    #         "results": results,
+    #         "scoring_data": scoring_data,
+    #         "nodata": nodata
+    #     }, f)
+
    return results, scoring_data, nodata


--- a/etl/eligibility/ha_15_32/requirements.txt
+++ b/etl/eligibility/ha_15_32/requirements.txt
@ -7,3 +7,5 @@ python-dotenv
 boto3
 textblob
 pyarrow==12.0.1
+fuzzywuzzy
+python-Levenshtein