diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 35da9c3b..1a3f6180 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -17,7 +17,7 @@ class AssetList: "first_two_words", # This method will split on the fist two words, where the separator is a space "first_word", # This method will split on the first word, where the separator is a space "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber - "address1_extraction" # This method will use the NLP model to extract address1 + # "address1_extraction" # This method will use the NLP model to extract address1 ] STANDARD_PROPERTY_TYPES = [ @@ -29,6 +29,19 @@ class AssetList: "block house", ] + # Standard column Names + STANDARD_ADDRESS_1 = "domna_address_1" + STANDARD_POSTCODE = "domna_postcode" + STANDARD_FULL_ADDRESS = "domna_full_address" + STANDARD_YEAR_BUILT = "domna_year_built" + STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_HEATING_SYSTEM = "landlord_heating_system" + STANDARD_EXISTING_PV = "landlord_existing_pv" + + DOMNA_PROPERTY_ID = "domna_property_id" + def __init__( self, local_filepath, @@ -36,8 +49,10 @@ class AssetList: address1_colname, postcode_colname, full_address_colname, + landlord_property_id=None, full_address_cols_to_concat=None, missing_postcodes_method=None, + address1_extraction_method=None, landlord_year_built=None, landlord_uprn=None, landlord_property_type=None, @@ -48,14 +63,15 @@ class AssetList: ): self.local_filepath = local_filepath self.sheet_name = sheet_name - self.standardised_asset_list = None # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + self.standardised_asset_list = self.raw_asset_list.copy() # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in 
self.raw_asset_list.columns else False # Names of columns + self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname self.postcode_colname = postcode_colname self.full_address_colname = full_address_colname @@ -69,6 +85,7 @@ class AssetList: # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.address1_extraction_method = address1_extraction_method self.debug_information = { "property_type": None, @@ -77,40 +94,50 @@ class AssetList: "existing_pv": None } - @classmethod - def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): - if method not in cls.ADDRESS_1_CLEANING_METHODS: + if method not in self.ADDRESS_1_CLEANING_METHODS: raise ValueError(f"Method {method} for producing address1 not recognized") if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0] return asset_list if method == "house_number_extraction": - asset_list["address1_extracted"] = asset_list.apply( + asset_list[self.address1_colname] = asset_list.apply( lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), axis=1 ) return asset_list - if method == "address1_extraction": - - x = asset_list_df[FULLADDRESS_COLUMN].values[0] - parsed = usaddress.parse(x) - - def extract_address_1(): - - - raise ValueError(f"Method {method} not recognized") + raise ValueError(f"Method {method} not recognized") 
@staticmethod def _address1_extraction(x): + pass + def create_property_id(self): + """ + This function creates the domna property ID, which is simply a hash of the full address and postcode + We want all figures to be positive + :return: + """ + import sys + self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( + self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ + self.postcode_colname] + ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + + @staticmethod + def _strip_postcode_from_full_address(full_address, postcode): + cleaned = full_address.replace(postcode, "") + # Remove any trailing commas and spaces + cleaned = cleaned.rstrip(", ").strip(",").strip() + return cleaned def standardise(self): """ @@ -118,15 +145,63 @@ class AssetList: :return: standardised asset list """ - if self.address1_colname is None: - # If we do not have this, we produce it + # Remove rows without a postcode + if self.postcode_colname is not None: + self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) + # We clean up portential non-breaking spaces, and double spaces + for col in [ + c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if + c is not None + ]: + self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False) + + if self.address1_colname is None: + if self.address1_extraction_method is None: + raise ValueError("Missing address 1 - please specify an extraction method") + self.address1_colname = self.STANDARD_ADDRESS_1 + # If we do not have this, we produce it + self.standardised_asset_list = self._extract_address1( + asset_list=self.standardised_asset_list, + full_address_col=self.full_address_colname, + 
postcode_col=self.postcode_colname, + method=self.address1_extraction_method + ) + + if self.full_address_colname is None: + if not self.full_address_cols_to_concat: + raise ValueError("Missing full address - please specify columns to concatenate") + self.full_address_colname = self.STANDARD_FULL_ADDRESS + self.standardised_asset_list[self.full_address_colname] = ( + self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) + ) + else: + + # Make sure to strip the postcode out of the full address + self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply( + lambda x: self._strip_postcode_from_full_address( + full_address=x[self.full_address_colname], + postcode=x[self.postcode_colname] + ), + axis=1 + ) + + # We create the domna property id + self.create_property_id() # We keep just the columns we care about and will work through the various columns and standardise - self.standardised_asset_list = self.raw_asset_list[ + self.standardised_asset_list = self.standardised_asset_list[ [ - self.address1_colname, self.postcode_colname, self.full_address_colname, - self.landlord_year_built, self.landlord_uprn, self.landlord_property_type + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_year_built, + self.landlord_uprn, + self.landlord_property_type, ] ] diff --git a/asset_list/README.md b/asset_list/README.md deleted file mode 100644 index 1bf734a4..00000000 --- a/asset_list/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# libpostal Installation Guide for macOS M1 - -## Overview - -`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide -provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. 
- ---- - -## 📌 Prerequisites - -Before installing `libpostal`, ensure you have the necessary dependencies installed. - -### **1️⃣ Install Required Dependencies** - -Open a terminal and run: - -```bash -brew install curl autoconf automake libtool pkg-config -``` - -### **2️⃣ Clone the libpostal Repository** - -```bash -git clone https://github.com/openvenues/libpostal.git -cd libpostal -``` - -### **3️⃣ Run Bootstrap Script** - -```bash -./bootstrap.sh -``` - -### **4️⃣ Configure the Build (Important for M1 Macs)** - -Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility. - -```bash -./configure --disable-sse2 --datadir=/usr/local/libpostal_data -``` - -*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* - -### **5️⃣ Compile and Install** - -```bash -make -j$(sysctl -n hw.ncpu) -sudo make install -``` - -### **6️⃣ Install Python Bindings** - -Once `libpostal` is installed, install the Python package: - -```bash -pip install postal -``` - ---- - -## ✅ **Verify Installation** - -To check if `libpostal` was installed successfully, run: - -```bash -python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" -``` - -**Expected Output:** - -``` -[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')] -``` - ---- - -## 📌 **Usage Example in Python** - -### **Address Parsing** - -```python -from postal.parser import parse - -address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" -parsed_address = dict(parse(address)) - -print(parsed_address) -``` - -**Expected Output:** - -```python -{ - 'house_number': '23', - 'road': 'Clifton Hill', - 'city': 'Newtown', - 'city': 'Exeter', - 'postcode': 'EX1 2DL' -} -``` - -### **Address Normalization** - -```python -from postal.normalize import normalize_string - -address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" -normalized = 
normalize_string(address) - -print(normalized) -``` - ---- - -## 📌 **Troubleshooting** - -### **1️⃣ libpostal Not Found?** - -If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: - -- You ran `sudo make install` -- Your Python environment recognizes `postal`. Try: - ```bash - pip install postal --no-cache-dir - ``` -- If using a virtual environment (`venv`), activate it before running Python. - -### **2️⃣ Compilation Issues on macOS?** - -If `make` fails, try running: - -```bash -brew reinstall autoconf automake libtool pkg-config -``` - -Then restart the installation process. - -### **3️⃣ Can't Find libpostal Data Directory?** - -Ensure `libpostal_data` exists in the correct directory: - -```bash -ls /usr/local/libpostal_data -``` - -If missing, re-run `./configure` with the correct path. - ---- - -## 🛠 **Uninstallation** - -To remove `libpostal`, run: - -```bash -sudo rm -rf /usr/local/lib/libpostal* -sudo rm -rf /usr/local/include/libpostal* -rm -rf ~/libpostal -pip uninstall postal -``` - ---- - -## 📌 **Additional Resources** - -- [Libpostal GitHub](https://github.com/openvenues/libpostal) -- [Libpostal Python Bindings](https://pypi.org/project/postal/) -- [Homebrew](https://brew.sh/) - ---- - -### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 
🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index d77c8a58..d6d64471 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,3 +1,8 @@ postal pandas -usaddress \ No newline at end of file +usaddress +pydantic-settings==2.6.0 +epc-api-python==1.0.2 +fuzzywuzzy +boto3 +openpyxl \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py index f0e6ce11..1a083bbc 100644 --- a/asset_list/tests/test_standardisation.py +++ b/asset_list/tests/test_standardisation.py @@ -1,9 +1,12 @@ from asset_list.AssetList import AssetList +from backend.SearchEpc import + def test_address1_extraction(): example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' - AssetList._extract_address1( - example, - ) + # AssetList._extract_address1( + # example, + # ) + pass diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index e8a9dfaa..79a041ec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -208,9 +208,14 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' - match = re.search(pattern, address) - if match: - return next(g for g in match.groups() if g is not None) + match1 = re.search(pattern, address) + if match1: + return next(g for g in match1.groups() if g is not None) + + pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + match2 = re.search(pattern2, address) + if match2: + return match2.group(2) parsed = usaddress.parse(address) # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected @@ -221,7 +226,8 @@ class SearchEpc: continue if part == postcode.split(" ")[1]: continue - return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip( + ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 
'AddressNumber' if no 'OccupancyIdentifier' is found diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index 3b2e2a5b..562585ad 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -48,3 +48,12 @@ class TestSearchEpcIntegration: assert epc_searcher.newest_epc["lmk-key"] == lmk_key assert epc_searcher.newest_epc["uprn"] == uprn assert len(epc_searcher.older_epcs) == n_old_epcs + + def test_search_housenumber(self): + eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + res1 = SearchEpc.get_house_number(eg1, None) + assert res1 == "A11" + + eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + res2 = SearchEpc.get_house_number(eg2, None) + assert res2 == "A9" diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 74dc28e0..fcf11765 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -352,9 +352,11 @@ def app(): sheet_name=SHEET_NAME, address1_colname=ADDRESS1_COLUMN, postcode_colname=POSTCODE_COLUMN, + landlord_property_id="UPRN", full_address_colname=FULLADDRESS_COLUMN, full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, missing_postcodes_method=MISSING_POSTCODES_METHOD, + address1_extraction_method=ADDRESS1_METHOD, landlord_year_built=PROPERTY_YEAR_BUILT, landlord_uprn=UPRN_COLUMN, landlord_property_type=PROPERTY_TYPE_COLUMN,