import re


class AssetList:
    """Excerpt of ``AssetList`` covering the multi-unit address flagging.

    Only the members that are fully visible in the change are reproduced
    here; the other members of the class (``__init__``, ``standardise``, ...)
    are not shown.
    """

    DOMNA_PROPERTY_ID = "domna_property_id"

    # Regular expression for identifying if the address might point to
    # multiple units: two alphanumeric tokens joined by a hyphen ("1-4",
    # "A-C", "12a-12d").
    # NOTE(review): the pattern also matches hyphenated words such as
    # "Stoke-on-Trent" -- confirm whether at least one digit should be
    # required before relying on it for building names.
    MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')

    @classmethod
    def _identify_multi_address(cls, address) -> bool:
        """Return True if the first line of *address* looks like a unit range.

        Only comma separated addresses are considered. The text before the
        first comma is treated as the "address 1" section and is searched for
        a token of the form ``x-y`` (see ``MULTI_UNIT_REGEX``).

        Args:
            address: The full address. Expected to be a string; any other
                value (for example ``None`` or a float NaN standing in for a
                missing address) is treated as "not a multi-unit address".

        Returns:
            ``True`` when the first comma-delimited section contains an
            ``x-y`` token, ``False`` otherwise. The result is always a
            ``bool`` so it can be stored directly as a flag.

        >>> AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill')
        True
        >>> AssetList._identify_multi_address('23 Clifton Hill, Exeter')
        False
        """
        # Non-string input would raise on the substring test below; a missing
        # address simply cannot be flagged.
        if not isinstance(address, str):
            return False

        address1_section, comma, _rest = address.partition(",")
        # Without a comma there is no distinct "address 1" section to inspect.
        if not comma:
            return False

        # We look for a string in the form (x-y)
        return bool(cls.MULTI_UNIT_REGEX.search(address1_section))


def test_multi_unit_address_flagging():
    assert AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL')
    # Addresses that cannot be inspected must yield an explicit False, not None.
    assert AssetList._identify_multi_address('23 Clifton Hill') is False
    assert AssetList._identify_multi_address(None) is False