setting up libpostal

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-19 12:53:09 +00:00
parent 8432b7d202
commit 7e9347e530
7 changed files with 272 additions and 5 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -1,5 +1,10 @@
import os
import usaddress
import pandas as pd
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
logger = setup_logger()
class AssetList:
@ -15,6 +20,15 @@ class AssetList:
"address1_extraction" # This method will use the NLP model to extract address1
]
STANDARD_PROPERTY_TYPES = [
"house",
"flat",
"bungalow",
"maisonette",
"park home",
"block house",
]
def __init__(
self,
local_filepath,
@ -26,6 +40,10 @@ class AssetList:
missing_postcodes_method=None,
landlord_year_built=None,
landlord_uprn=None,
landlord_property_type=None,
landlord_wall_construction=None,
landlord_heating_system=None,
landlord_existing_pv=None,
header=0
):
self.local_filepath = local_filepath
@ -43,21 +61,72 @@ class AssetList:
self.full_address_colname = full_address_colname
self.landlord_year_built = landlord_year_built
self.landlord_uprn = landlord_uprn
self.landlord_property_type = landlord_property_type
self.landlord_wall_construction = landlord_wall_construction
self.landlord_heating_system = landlord_heating_system
self.landlord_existing_pv = landlord_existing_pv
# parameters for cleaning
self.full_address_cols_to_concat = full_address_cols_to_concat
self.missing_postcodes_method = missing_postcodes_method
self.debug_information = {
"property_type": None,
"wall_construction": None,
"heating_system": None,
"existing_pv": None
}
@classmethod
def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
if method not in cls.ADDRESS_1_CLEANING_METHODS:
raise ValueError(f"Method {method} for producing address1 not recognized")
if method == "first_two_words":
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
return asset_list
if method == "first_word":
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
return asset_list
if method == "house_number_extraction":
asset_list["address1_extracted"] = asset_list.apply(
lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
axis=1
)
return asset_list
if method == "address1_extraction":
x = asset_list_df[FULLADDRESS_COLUMN].values[0]
parsed = usaddress.parse(x)
def extract_address_1():
raise ValueError(f"Method {method} not recognized")
@staticmethod
def _address1_extraction(x):
def standardise(self):
"""
This function is used to standardise the asset list
:return: standardised asset list
"""
if self.address1_colname is None:
# If we do not have this, we produce it
# We keep just the columns we care about and will work through the various columns and standardise
self.standardised_asset_list = self.raw_asset_list[
[
self.address1_colname, self.postcode_colname, self.full_address_colname,
self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
]
]

172
asset_list/README.md Normal file
View file

@ -0,0 +1,172 @@
# libpostal Installation Guide for macOS M1
## Overview
`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
---
## 📌 Prerequisites
Before installing `libpostal`, ensure you have the necessary dependencies installed.
### **1⃣ Install Required Dependencies**
Open a terminal and run:
```bash
brew install curl autoconf automake libtool pkg-config
```
### **2⃣ Clone the libpostal Repository**
```bash
git clone https://github.com/openvenues/libpostal.git
cd libpostal
```
### **3⃣ Run Bootstrap Script**
```bash
./bootstrap.sh
```
### **4⃣ Configure the Build (Important for M1 Macs)**
Apple Silicon (M1/M2) chips are ARM-based and do not implement SSE2, which is an x86 instruction-set extension, so you **must** disable SSE2 when configuring the build.
```bash
./configure --disable-sse2 --datadir=/usr/local/libpostal_data
```
*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
### **5⃣ Compile and Install**
```bash
make -j$(sysctl -n hw.ncpu)
sudo make install
```
### **6⃣ Install Python Bindings**
Once `libpostal` is installed, install the Python package:
```bash
pip install postal
```
---
## ✅ **Verify Installation**
To check if `libpostal` was installed successfully, run:
```bash
python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
```
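**Example output** (libpostal lowercases the tokens it returns, and the labels assigned may differ slightly between model versions):
```
[('23', 'house_number'), ('clifton hill', 'road'), ('newtown', 'city'), ('exeter', 'city'), ('ex1 2dl', 'postcode')]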
```
---
## 📌 **Usage Example in Python**
### **Address Parsing**
```python
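from postal.parser import parse_address
address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
# parse_address returns (value, label) pairs, so swap them to key the dict by label.
# A label that occurs more than once (here 'city') keeps only its last value.
parsed_address = {label: value for value, label in parse_address(address)}
print(parsed_address)
```
**Example output** (tokens are lowercased by libpostal; a dict cannot hold two `'city'` keys, so only the last one survives):
```python
{
    'house_number': '23',
    'road': 'clifton hill',
    'city': 'exeter',
    'postcode': 'ex1 2dl'
}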
```
### **Address Normalization**
```python
from postal.normalize import normalize_string
address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
normalized = normalize_string(address)
print(normalized)
```
---
## 📌 **Troubleshooting**
### **1⃣ libpostal Not Found?**
If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
- You ran `sudo make install`
- Your Python environment recognizes `postal`. Try:
```bash
pip install postal --no-cache-dir
```
- If using a virtual environment (`venv`), activate it before running Python.
### **2⃣ Compilation Issues on macOS?**
If `make` fails, try running:
```bash
brew reinstall autoconf automake libtool pkg-config
```
Then restart the installation process.
### **3⃣ Can't Find libpostal Data Directory?**
Ensure `libpostal_data` exists in the correct directory:
```bash
ls /usr/local/libpostal_data
```
If missing, re-run `./configure` with the correct path.
---
## 🛠 **Uninstallation**
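To remove `libpostal`, run the commands below. They assume the repository was cloned into your home directory; adjust the `~/libpostal` path if you cloned it elsewhere. To reclaim disk space, also delete the data directory you passed to `--datadir` (for example `/usr/local/libpostal_data`).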
```bash
sudo rm -rf /usr/local/lib/libpostal*
sudo rm -rf /usr/local/include/libpostal*
rm -rf ~/libpostal
pip uninstall postal
```
---
## 📌 **Additional Resources**
- [Libpostal GitHub](https://github.com/openvenues/libpostal)
- [Libpostal Python Bindings](https://pypi.org/project/postal/)
- [Homebrew](https://brew.sh/)
---
### 🎉 You're all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀

View file

@ -0,0 +1,3 @@
postal
pandas
usaddress

View file

@ -0,0 +1,9 @@
from asset_list.AssetList import AssetList
def test_address1_extraction():
    """Check that ``AssetList._extract_address1`` adds an ``address1_extracted`` column.

    ``_extract_address1`` takes a DataFrame plus the names of its full-address
    and postcode columns, so the example address is wrapped in a one-row frame
    rather than being passed as a bare string.
    """
    # Local import: this test module otherwise only imports AssetList.
    import pandas as pd

    example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
    asset_list = pd.DataFrame(
        {
            "full_address": [example],
            "postcode": ["EX1 2DL"],
        }
    )

    result = AssetList._extract_address1(
        asset_list,
        full_address_col="full_address",
        postcode_col="postcode",
    )

    # No method is passed, so the default "first_two_words" applies: the
    # expected value is the first two space-separated tokens of the address.
    assert result["address1_extracted"].tolist() == ["Block (Rooms"]

View file

@ -346,10 +346,24 @@ def app():
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
asset_list = AssetList(
self = AssetList(
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
header=0,
sheet_name=SHEET_NAME
sheet_name=SHEET_NAME,
address1_colname=ADDRESS1_COLUMN,
postcode_colname=POSTCODE_COLUMN,
full_address_colname=FULLADDRESS_COLUMN,
full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
missing_postcodes_method=MISSING_POSTCODES_METHOD,
landlord_year_built=PROPERTY_YEAR_BUILT,
landlord_uprn=UPRN_COLUMN,
landlord_property_type=PROPERTY_TYPE_COLUMN,
landlord_wall_construction="Wall Construction (EPC)",
landlord_heating_system="Heat Source",
landlord_existing_pv="PV (Y/N)"
)
self.standardised_asset_list(
# In here, we might want to pass some specific remaps
)
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"