From 7e9347e530cc52fe38ceef66163447d6fd556b5e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 12:53:09 +0000 Subject: [PATCH] setting up libpostal --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 71 +++++++++- asset_list/README.md | 172 +++++++++++++++++++++++ asset_list/requirements.txt | 3 + asset_list/tests/test_standardisation.py | 9 ++ etl/route_march_data_pull/app.py | 18 ++- 7 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 asset_list/README.md create mode 100644 asset_list/requirements.txt create mode 100644 asset_list/tests/test_standardisation.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2a16e82f..35da9c3b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,5 +1,10 @@ import os +import usaddress import pandas as pd +from utils.logger import setup_logger +from backend.SearchEpc import SearchEpc + +logger = setup_logger() class AssetList: @@ -15,6 +20,15 @@ class AssetList: "address1_extraction" # This method will use the NLP model to extract address1 ] + STANDARD_PROPERTY_TYPES = [ + "house", + "flat", + "bungalow", + "maisonette", + "park home", + "block house", + ] + def __init__( self, local_filepath, @@ -26,6 +40,10 @@ class AssetList: missing_postcodes_method=None, landlord_year_built=None, landlord_uprn=None, + landlord_property_type=None, + landlord_wall_construction=None, + landlord_heating_system=None, + landlord_existing_pv=None, header=0 ): self.local_filepath = local_filepath @@ -43,21 +61,72 @@ class AssetList: self.full_address_colname = full_address_colname self.landlord_year_built = landlord_year_built self.landlord_uprn = landlord_uprn + 
self.landlord_property_type = landlord_property_type + self.landlord_wall_construction = landlord_wall_construction + self.landlord_heating_system = landlord_heating_system + self.landlord_existing_pv = landlord_existing_pv # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.debug_information = { + "property_type": None, + "wall_construction": None, + "heating_system": None, + "existing_pv": None + } + + @classmethod + def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + + if method not in cls.ADDRESS_1_CLEANING_METHODS: + raise ValueError(f"Method {method} for producing address1 not recognized") + + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + if method == "address1_extraction": + + x = asset_list_df[FULLADDRESS_COLUMN].values[0] + parsed = usaddress.parse(x) + + def extract_address_1(): + + + raise ValueError(f"Method {method} not recognized") + + @staticmethod + def _address1_extraction(x): + + def standardise(self): """ This function is used to standardise the asset list :return: standardised asset list """ + if self.address1_colname is None: + # If we do not have this, we produce it + + # We keep just the columns we care about and will work through the various columns and standardise self.standardised_asset_list = self.raw_asset_list[ [ - + self.address1_colname, self.postcode_colname, self.full_address_colname, + self.landlord_year_built, 
self.landlord_uprn, self.landlord_property_type ] ] diff --git a/asset_list/README.md b/asset_list/README.md new file mode 100644 index 00000000..1bf734a4 --- /dev/null +++ b/asset_list/README.md @@ -0,0 +1,172 @@ +# libpostal Installation Guide for macOS M1 + +## Overview + +`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide +provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. + +--- + +## 📌 Prerequisites + +Before installing `libpostal`, ensure you have the necessary dependencies installed. + +### **1️⃣ Install Required Dependencies** + +Open a terminal and run: + +```bash +brew install curl autoconf automake libtool pkg-config +``` + +### **2️⃣ Clone the libpostal Repository** + +```bash +git clone https://github.com/openvenues/libpostal.git +cd libpostal +``` + +### **3️⃣ Run Bootstrap Script** + +```bash +./bootstrap.sh +``` + +### **4️⃣ Configure the Build (Important for M1 Macs)** + +Because Apple Silicon (ARM) chips do not implement the x86-only SSE2 instruction set, you **must** disable SSE2 when configuring the build. 
+ +```bash +./configure --disable-sse2 --datadir=/usr/local/libpostal_data +``` + +*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* + +### **5️⃣ Compile and Install** + +```bash +make -j$(sysctl -n hw.ncpu) +sudo make install +``` + +### **6️⃣ Install Python Bindings** + +Once `libpostal` is installed, install the Python package: + +```bash +pip install postal +``` + +--- + +## ✅ **Verify Installation** + +To check if `libpostal` was installed successfully, run: + +```bash +python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" +``` + +**Expected Output** (values are lower-cased by libpostal; the exact labels may vary with the installed model): + +``` +[('23', 'house_number'), ('clifton hill', 'road'), ('newtown', 'city'), ('exeter', 'city'), ('ex1 2dl', 'postcode')] +``` + +--- + +## 📌 **Usage Example in Python** + +### **Address Parsing** + +```python +from postal.parser import parse_address + +address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" +parsed_address = {label: value for value, label in parse_address(address)} + +print(parsed_address) +``` + +**Expected Output:** + +```python +{ + 'house_number': '23', + 'road': 'clifton hill', + # 'city': 'newtown' is overwritten: a dict keeps only the last value per label + 'city': 'exeter', + 'postcode': 'ex1 2dl' +} +``` + +### **Address Normalization** + +```python +from postal.normalize import normalize_string + +address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" +normalized = normalize_string(address) + +print(normalized) +``` + +--- + +## 📌 **Troubleshooting** + +### **1️⃣ libpostal Not Found?** + +If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: + +- You ran `sudo make install` +- Your Python environment recognizes `postal`. Try: + ```bash + pip install postal --no-cache-dir + ``` +- If using a virtual environment (`venv`), activate it before running Python. 
+ +### **2️⃣ Compilation Issues on macOS?** + +If `make` fails, try running: + +```bash +brew reinstall autoconf automake libtool pkg-config +``` + +Then restart the installation process. + +### **3️⃣ Can't Find libpostal Data Directory?** + +Ensure `libpostal_data` exists in the correct directory: + +```bash +ls /usr/local/libpostal_data +``` + +If missing, re-run `./configure` with the correct path. + +--- + +## 🛠 **Uninstallation** + +To remove `libpostal`, run: + +```bash +sudo rm -rf /usr/local/lib/libpostal* +sudo rm -rf /usr/local/include/libpostal* +rm -rf ~/libpostal +pip uninstall postal +``` + +--- + +## 📌 **Additional Resources** + +- [Libpostal GitHub](https://github.com/openvenues/libpostal) +- [Libpostal Python Bindings](https://pypi.org/project/postal/) +- [Homebrew](https://brew.sh/) + +--- + +### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt new file mode 100644 index 00000000..d77c8a58 --- /dev/null +++ b/asset_list/requirements.txt @@ -0,0 +1,3 @@ +postal +pandas +usaddress \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py new file mode 100644 index 00000000..f0e6ce11 --- /dev/null +++ b/asset_list/tests/test_standardisation.py @@ -0,0 +1,9 @@ +from asset_list.AssetList import AssetList + + +def test_address1_extraction(): + example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' + + AssetList._extract_address1( + example, + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 06082774..74dc28e0 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,10 +346,24 @@ def app(): invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] - asset_list = AssetList( + self = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, - 
sheet_name=SHEET_NAME + sheet_name=SHEET_NAME, + address1_colname=ADDRESS1_COLUMN, + postcode_colname=POSTCODE_COLUMN, + full_address_colname=FULLADDRESS_COLUMN, + full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, + missing_postcodes_method=MISSING_POSTCODES_METHOD, + landlord_year_built=PROPERTY_YEAR_BUILT, + landlord_uprn=UPRN_COLUMN, + landlord_property_type=PROPERTY_TYPE_COLUMN, + landlord_wall_construction="Wall Construction (EPC)", + landlord_heating_system="Heat Source", + landlord_existing_pv="PV (Y/N)" + ) + self.standardised_asset_list( + # In here, we might want to pass some specific remaps ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"