mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
setting up libpostal
This commit is contained in:
parent
8432b7d202
commit
7e9347e530
7 changed files with 272 additions and 5 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
import os
|
||||
import usaddress
|
||||
import pandas as pd
|
||||
from utils.logger import setup_logger
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class AssetList:
|
||||
|
|
@ -15,6 +20,15 @@ class AssetList:
|
|||
"address1_extraction" # This method will use the NLP model to extract address1
|
||||
]
|
||||
|
||||
STANDARD_PROPERTY_TYPES = [
|
||||
"house",
|
||||
"flat",
|
||||
"bungalow",
|
||||
"maisonette",
|
||||
"park home",
|
||||
"block house",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
local_filepath,
|
||||
|
|
@ -26,6 +40,10 @@ class AssetList:
|
|||
missing_postcodes_method=None,
|
||||
landlord_year_built=None,
|
||||
landlord_uprn=None,
|
||||
landlord_property_type=None,
|
||||
landlord_wall_construction=None,
|
||||
landlord_heating_system=None,
|
||||
landlord_existing_pv=None,
|
||||
header=0
|
||||
):
|
||||
self.local_filepath = local_filepath
|
||||
|
|
@ -43,21 +61,72 @@ class AssetList:
|
|||
self.full_address_colname = full_address_colname
|
||||
self.landlord_year_built = landlord_year_built
|
||||
self.landlord_uprn = landlord_uprn
|
||||
self.landlord_property_type = landlord_property_type
|
||||
self.landlord_wall_construction = landlord_wall_construction
|
||||
self.landlord_heating_system = landlord_heating_system
|
||||
self.landlord_existing_pv = landlord_existing_pv
|
||||
|
||||
# parameters for cleaning
|
||||
self.full_address_cols_to_concat = full_address_cols_to_concat
|
||||
self.missing_postcodes_method = missing_postcodes_method
|
||||
|
||||
self.debug_information = {
|
||||
"property_type": None,
|
||||
"wall_construction": None,
|
||||
"heating_system": None,
|
||||
"existing_pv": None
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
|
||||
|
||||
if method not in cls.ADDRESS_1_CLEANING_METHODS:
|
||||
raise ValueError(f"Method {method} for producing address1 not recognized")
|
||||
|
||||
if method == "first_two_words":
|
||||
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
|
||||
return asset_list
|
||||
|
||||
if method == "first_word":
|
||||
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
|
||||
return asset_list
|
||||
|
||||
if method == "house_number_extraction":
|
||||
asset_list["address1_extracted"] = asset_list.apply(
|
||||
lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
|
||||
axis=1
|
||||
)
|
||||
return asset_list
|
||||
|
||||
if method == "address1_extraction":
|
||||
|
||||
x = asset_list_df[FULLADDRESS_COLUMN].values[0]
|
||||
parsed = usaddress.parse(x)
|
||||
|
||||
def extract_address_1():
|
||||
|
||||
|
||||
raise ValueError(f"Method {method} not recognized")
|
||||
|
||||
@staticmethod
|
||||
def _address1_extraction(x):
|
||||
|
||||
|
||||
def standardise(self):
|
||||
"""
|
||||
This function is used to standardise the asset list
|
||||
:return: standardised asset list
|
||||
"""
|
||||
|
||||
if self.address1_colname is None:
|
||||
# If we do not have this, we produce it
|
||||
|
||||
|
||||
# We keep just the columns we care about and will work through the various columns and standardise
|
||||
self.standardised_asset_list = self.raw_asset_list[
|
||||
[
|
||||
|
||||
self.address1_colname, self.postcode_colname, self.full_address_colname,
|
||||
self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
|
||||
]
|
||||
]
|
||||
|
||||
|
|
|
|||
172
asset_list/README.md
Normal file
172
asset_list/README.md
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
# libpostal Installation Guide for macOS M1
|
||||
|
||||
## Overview
|
||||
|
||||
`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
|
||||
provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
|
||||
|
||||
---
|
||||
|
||||
## 📌 Prerequisites
|
||||
|
||||
Before installing `libpostal`, ensure you have the necessary dependencies installed.
|
||||
|
||||
### **1️⃣ Install Required Dependencies**
|
||||
|
||||
Open a terminal and run:
|
||||
|
||||
```bash
|
||||
brew install curl autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
### **2️⃣ Clone the libpostal Repository**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/openvenues/libpostal.git
|
||||
cd libpostal
|
||||
```
|
||||
|
||||
### **3️⃣ Run Bootstrap Script**
|
||||
|
||||
```bash
|
||||
./bootstrap.sh
|
||||
```
|
||||
|
||||
### **4️⃣ Configure the Build (Important for M1 Macs)**
|
||||
|
||||
SSE2 is an x86 instruction-set extension that Apple Silicon (ARM) chips do not provide, so you **must** disable SSE2 when configuring the build.
|
||||
|
||||
```bash
|
||||
./configure --disable-sse2 --datadir=/usr/local/libpostal_data
|
||||
```
|
||||
|
||||
*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
|
||||
|
||||
### **5️⃣ Compile and Install**
|
||||
|
||||
```bash
|
||||
make -j$(sysctl -n hw.ncpu)
|
||||
sudo make install
|
||||
```
|
||||
|
||||
### **6️⃣ Install Python Bindings**
|
||||
|
||||
Once `libpostal` is installed, install the Python package:
|
||||
|
||||
```bash
|
||||
pip install postal
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ **Verify Installation**
|
||||
|
||||
To check if `libpostal` was installed successfully, run:
|
||||
|
||||
```bash
|
||||
python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
|
||||
```
|
||||
[('23', 'house_number'), ('clifton hill', 'road'), ('newtown', 'city'), ('exeter', 'city'), ('ex1 2dl', 'postcode')]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Usage Example in Python**
|
||||
|
||||
### **Address Parsing**
|
||||
|
||||
```python
|
||||
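from postal.parser import parse_address
|
||||
|
||||
address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
|
||||
# parse_address returns (value, label) pairs, so flip them to build a label -> value dict
parsed_address = {label: value for value, label in parse_address(address)}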
|
||||
|
||||
print(parsed_address)
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
|
||||
```python
|
||||
{
|
||||
'house_number': '23',
|
||||
'road': 'Clifton Hill',
|
||||
# ('Newtown' is also labelled 'city', but a dict keeps only the last value for a repeated label)
|
||||
'city': 'Exeter',
|
||||
'postcode': 'EX1 2DL'
|
||||
}
|
||||
```
|
||||
|
||||
### **Address Normalization**
|
||||
|
||||
```python
|
||||
from postal.normalize import normalize_string
|
||||
|
||||
address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
|
||||
normalized = normalize_string(address)
|
||||
|
||||
print(normalized)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Troubleshooting**
|
||||
|
||||
### **1️⃣ libpostal Not Found?**
|
||||
|
||||
If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
|
||||
|
||||
- You ran `sudo make install`
|
||||
- Your Python environment recognizes `postal`. Try:
|
||||
```bash
|
||||
pip install postal --no-cache-dir
|
||||
```
|
||||
- If using a virtual environment (`venv`), activate it before running Python.
|
||||
|
||||
### **2️⃣ Compilation Issues on macOS?**
|
||||
|
||||
If `make` fails, try running:
|
||||
|
||||
```bash
|
||||
brew reinstall autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
Then restart the installation process.
|
||||
|
||||
### **3️⃣ Can't Find libpostal Data Directory?**
|
||||
|
||||
Ensure `libpostal_data` exists in the correct directory:
|
||||
|
||||
```bash
|
||||
ls /usr/local/libpostal_data
|
||||
```
|
||||
|
||||
If missing, re-run `./configure` with the correct path.
|
||||
|
||||
---
|
||||
|
||||
## 🛠 **Uninstallation**
|
||||
|
||||
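To remove `libpostal`, run the commands below (adjust `~/libpostal` to wherever you cloned the repository, and also run `sudo rm -rf /usr/local/libpostal_data` if you want to delete the downloaded model data):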
|
||||
|
||||
```bash
|
||||
sudo rm -rf /usr/local/lib/libpostal*
|
||||
sudo rm -rf /usr/local/include/libpostal*
|
||||
rm -rf ~/libpostal
|
||||
pip uninstall postal
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Additional Resources**
|
||||
|
||||
- [Libpostal GitHub](https://github.com/openvenues/libpostal)
|
||||
- [Libpostal Python Bindings](https://pypi.org/project/postal/)
|
||||
- [Homebrew](https://brew.sh/)
|
||||
|
||||
---
|
||||
|
||||
### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀
|
||||
3
asset_list/requirements.txt
Normal file
3
asset_list/requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
postal
|
||||
pandas
|
||||
usaddress
|
||||
9
asset_list/tests/test_standardisation.py
Normal file
9
asset_list/tests/test_standardisation.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from asset_list.AssetList import AssetList
|
||||
|
||||
|
||||
def test_address1_extraction():
|
||||
example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
|
||||
|
||||
AssetList._extract_address1(
|
||||
example,
|
||||
)
|
||||
|
|
@ -346,10 +346,24 @@ def app():
|
|||
|
||||
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
|
||||
|
||||
asset_list = AssetList(
|
||||
self = AssetList(
|
||||
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
|
||||
header=0,
|
||||
sheet_name=SHEET_NAME
|
||||
sheet_name=SHEET_NAME,
|
||||
address1_colname=ADDRESS1_COLUMN,
|
||||
postcode_colname=POSTCODE_COLUMN,
|
||||
full_address_colname=FULLADDRESS_COLUMN,
|
||||
full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
|
||||
missing_postcodes_method=MISSING_POSTCODES_METHOD,
|
||||
landlord_year_built=PROPERTY_YEAR_BUILT,
|
||||
landlord_uprn=UPRN_COLUMN,
|
||||
landlord_property_type=PROPERTY_TYPE_COLUMN,
|
||||
landlord_wall_construction="Wall Construction (EPC)",
|
||||
landlord_heating_system="Heat Source",
|
||||
landlord_existing_pv="PV (Y/N)"
|
||||
)
|
||||
self.standardised_asset_list(
|
||||
# In here, we might want to pass some specific remaps
|
||||
)
|
||||
|
||||
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue