working on address extraction

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-19 14:12:57 +00:00
parent 7e9347e530
commit cb0194c3b9
7 changed files with 130 additions and 202 deletions

View file

@ -17,7 +17,7 @@ class AssetList:
"first_two_words", # This method will split on the fist two words, where the separator is a space
"first_word", # This method will split on the first word, where the separator is a space
"house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber
"address1_extraction" # This method will use the NLP model to extract address1
# "address1_extraction" # This method will use the NLP model to extract address1
]
STANDARD_PROPERTY_TYPES = [
@ -29,6 +29,19 @@ class AssetList:
"block house",
]
# Standard column Names
STANDARD_ADDRESS_1 = "domna_address_1"
STANDARD_POSTCODE = "domna_postcode"
STANDARD_FULL_ADDRESS = "domna_full_address"
STANDARD_YEAR_BUILT = "domna_year_built"
STANDARD_UPRN = "ordnance_survey_uprn"
STANDARD_PROPERTY_TYPE = "landlord_property_type"
STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
STANDARD_HEATING_SYSTEM = "landlord_heating_system"
STANDARD_EXISTING_PV = "landlord_existing_pv"
DOMNA_PROPERTY_ID = "domna_property_id"
def __init__(
self,
local_filepath,
@ -36,8 +49,10 @@ class AssetList:
address1_colname,
postcode_colname,
full_address_colname,
landlord_property_id=None,
full_address_cols_to_concat=None,
missing_postcodes_method=None,
address1_extraction_method=None,
landlord_year_built=None,
landlord_uprn=None,
landlord_property_type=None,
@ -48,14 +63,15 @@ class AssetList:
):
self.local_filepath = local_filepath
self.sheet_name = sheet_name
self.standardised_asset_list = None
# Read in the data
self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
self.standardised_asset_list = self.raw_asset_list.copy()
# We detect the presence of the non-intrusive columns
self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
# Names of columns
self.landlord_property_id = landlord_property_id
self.address1_colname = address1_colname
self.postcode_colname = postcode_colname
self.full_address_colname = full_address_colname
@ -69,6 +85,7 @@ class AssetList:
# parameters for cleaning
self.full_address_cols_to_concat = full_address_cols_to_concat
self.missing_postcodes_method = missing_postcodes_method
self.address1_extraction_method = address1_extraction_method
self.debug_information = {
"property_type": None,
@ -77,40 +94,50 @@ class AssetList:
"existing_pv": None
}
def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
    """
    Derive the address-1 value for every row from the full address.

    The result is written to the column named by ``self.address1_colname``.

    :param asset_list: DataFrame holding the asset list; the new column is added to it in place
    :param full_address_col: name of the column holding the full address
    :param postcode_col: name of the postcode column (only read by "house_number_extraction")
    :param method: one of the values in ``self.ADDRESS_1_CLEANING_METHODS``
    :return: the asset list with the address-1 column populated
    :raises ValueError: if the method is not listed, or is listed but has no implementation here
    """
    if method not in self.ADDRESS_1_CLEANING_METHODS:
        raise ValueError(f"Method {method} for producing address1 not recognized")

    if method == "first_two_words":
        # Keep the first two space-separated tokens, e.g. "23 Clifton"
        asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
        return asset_list

    if method == "first_word":
        # Keep only the first space-separated token, e.g. "23"
        asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
        return asset_list

    if method == "house_number_extraction":
        # Row-wise because the extractor is called with both the address and the postcode
        asset_list[self.address1_colname] = asset_list.apply(
            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
            axis=1
        )
        return asset_list

    # Reached only when a method is listed as valid but has no branch above
    raise ValueError(f"Method {method} not recognized")
@staticmethod
def _address1_extraction(x):
    """
    Placeholder with no implementation: the body is a bare ``pass``, so it returns None for any x.
    """
    pass
def create_property_id(self):
    """
    Create the domna property ID column on the standardised asset list.

    The ID is a positive integer derived from a SHA-256 digest of the full address concatenated with
    the postcode. The built-in ``hash`` is deliberately not used: string hashing is salted per Python
    process, so it would hand the same property a different ID on every run.

    :return: None - the column named by ``self.DOMNA_PROPERTY_ID`` is added in place
    """
    import hashlib

    def _stable_id(value):
        digest = hashlib.sha256(str(value).encode("utf-8")).digest()
        # Keep 63 bits so that the ID is always positive and fits a signed 64-bit integer column
        return int.from_bytes(digest[:8], "big") >> 1

    address_and_postcode = (
        self.standardised_asset_list[self.full_address_colname]
        + self.standardised_asset_list[self.postcode_colname]
    )
    self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = address_and_postcode.apply(_stable_id)
@staticmethod
def _strip_postcode_from_full_address(full_address, postcode):
    """
    Remove the postcode from a full address and tidy up the separators left behind.

    The postcode is matched as a whole token, ignoring case and ignoring how its parts are spaced,
    so "ex1 2dl" and "EX12DL" are both removed for the postcode "EX1 2DL".

    :param full_address: the full address string, which may or may not contain the postcode
    :param postcode: the postcode to remove; an empty or non-string value removes nothing
    :return: the address without the postcode and without leading or trailing commas and spaces
    """
    import re

    cleaned = full_address
    if isinstance(postcode, str) and postcode.strip():
        # Whole-token match so that a short postcode can never eat into a neighbouring word
        postcode_pattern = (
            r"(?<!\w)" + r"\s*".join(re.escape(part) for part in postcode.split()) + r"(?!\w)"
        )
        cleaned = re.sub(postcode_pattern, "", cleaned, flags=re.IGNORECASE)
    # A postcode in the middle of the address leaves an empty ", ," segment - collapse it
    cleaned = re.sub(r"\s*,(?:\s*,)+\s*", ", ", cleaned)
    # Remove any leading or trailing commas and whitespace
    return re.sub(r"^[\s,]+|[\s,]+$", "", cleaned)
def standardise(self):
"""
@ -118,15 +145,63 @@ class AssetList:
:return: standardised asset list
"""
if self.address1_colname is None:
# If we do not have this, we produce it
# Remove rows without a postcode
if self.postcode_colname is not None:
self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
# We clean up potential non-breaking spaces, and double spaces
for col in [
c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
c is not None
]:
self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False)
if self.address1_colname is None:
if self.address1_extraction_method is None:
raise ValueError("Missing address 1 - please specify an extraction method")
self.address1_colname = self.STANDARD_ADDRESS_1
# If we do not have this, we produce it
self.standardised_asset_list = self._extract_address1(
asset_list=self.standardised_asset_list,
full_address_col=self.full_address_colname,
postcode_col=self.postcode_colname,
method=self.address1_extraction_method
)
if self.full_address_colname is None:
if not self.full_address_cols_to_concat:
raise ValueError("Missing full address - please specify columns to concatenate")
self.full_address_colname = self.STANDARD_FULL_ADDRESS
self.standardised_asset_list[self.full_address_colname] = (
self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1)
)
else:
# Make sure to strip the postcode out of the full address
self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
lambda x: self._strip_postcode_from_full_address(
full_address=x[self.full_address_colname],
postcode=x[self.postcode_colname]
),
axis=1
)
# We create the domna property id
self.create_property_id()
# We keep just the columns we care about and will work through the various columns and standardise
self.standardised_asset_list = self.raw_asset_list[
self.standardised_asset_list = self.standardised_asset_list[
[
self.address1_colname, self.postcode_colname, self.full_address_colname,
self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
self.landlord_property_id,
self.DOMNA_PROPERTY_ID,
self.address1_colname,
self.postcode_colname,
self.full_address_colname,
self.landlord_year_built,
self.landlord_uprn,
self.landlord_property_type,
]
]

View file

@ -1,172 +0,0 @@
# libpostal Installation Guide for macOS M1
## Overview
`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
---
## 📌 Prerequisites
Before installing `libpostal`, ensure you have the necessary dependencies installed.
### **1⃣ Install Required Dependencies**
Open a terminal and run:
```bash
brew install curl autoconf automake libtool pkg-config
```
### **2⃣ Clone the libpostal Repository**
```bash
git clone https://github.com/openvenues/libpostal.git
cd libpostal
```
### **3⃣ Run Bootstrap Script**
```bash
./bootstrap.sh
```
### **4⃣ Configure the Build (Important for M1 Macs)**
Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility.
```bash
./configure --disable-sse2 --datadir=/usr/local/libpostal_data
```
*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
### **5⃣ Compile and Install**
```bash
make -j$(sysctl -n hw.ncpu)
sudo make install
```
### **6⃣ Install Python Bindings**
Once `libpostal` is installed, install the Python package:
```bash
pip install postal
```
---
## ✅ **Verify Installation**
To check if `libpostal` was installed successfully, run:
```bash
python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
```
**Expected Output** (libpostal lower-cases the values; the exact labels can vary with the installed model data):
```
[('23', 'house_number'), ('clifton hill', 'road'), ('newtown', 'city'), ('exeter', 'city'), ('ex1 2dl', 'postcode')]
```
---
## 📌 **Usage Example in Python**
### **Address Parsing**
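```python
from postal.parser import parse_address
address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
# parse_address returns (value, label) pairs, and a label such as 'city' can occur more than once,
# so keep the pairs as a list rather than collapsing them into a dict
parsed_address = parse_address(address)
print(parsed_address)
```
**Expected Output** (values are lower-cased by libpostal; labels can vary with the installed model data):
```python
[
('23', 'house_number'),
('clifton hill', 'road'),
('newtown', 'city'),
('exeter', 'city'),
('ex1 2dl', 'postcode')
]
```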
### **Address Normalization**
```python
from postal.normalize import normalize_string
address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
normalized = normalize_string(address)
print(normalized)
```
---
## 📌 **Troubleshooting**
### **1⃣ libpostal Not Found?**
If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
- You ran `sudo make install`
- Your Python environment recognizes `postal`. Try:
```bash
pip install postal --no-cache-dir
```
- If using a virtual environment (`venv`), activate it before running Python.
### **2⃣ Compilation Issues on macOS?**
If `make` fails, try running:
```bash
brew reinstall autoconf automake libtool pkg-config
```
Then restart the installation process.
### **3⃣ Can't Find libpostal Data Directory?**
Ensure `libpostal_data` exists in the correct directory:
```bash
ls /usr/local/libpostal_data
```
If missing, re-run `./configure` with the correct path.
---
## 🛠 **Uninstallation**
To remove `libpostal`, run:
```bash
sudo rm -rf /usr/local/lib/libpostal*
sudo rm -rf /usr/local/include/libpostal*
rm -rf ~/libpostal
pip uninstall postal
```
---
## 📌 **Additional Resources**
- [Libpostal GitHub](https://github.com/openvenues/libpostal)
- [Libpostal Python Bindings](https://pypi.org/project/postal/)
- [Homebrew](https://brew.sh/)
---
### 🎉 You're all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀

View file

@ -1,3 +1,8 @@
postal
pandas
usaddress
usaddress
pydantic-settings==2.6.0
epc-api-python==1.0.2
fuzzywuzzy
boto3
openpyxl

View file

@ -1,9 +1,12 @@
from asset_list.AssetList import AssetList
from backend.SearchEpc import
def test_address1_extraction():
example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
AssetList._extract_address1(
example,
)
# AssetList._extract_address1(
# example,
# )
pass

View file

@ -208,9 +208,14 @@ class SearchEpc:
try:
# Updated regex to catch house numbers including alphanumeric ones
pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
match = re.search(pattern, address)
if match:
return next(g for g in match.groups() if g is not None)
match1 = re.search(pattern, address)
if match1:
return next(g for g in match1.groups() if g is not None)
pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)'
match2 = re.search(pattern2, address)
if match2:
return match2.group(2)
parsed = usaddress.parse(address)
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
@ -221,7 +226,8 @@ class SearchEpc:
continue
if part == postcode.split(" ")[1]:
continue
return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
return part.rstrip(
",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
# number
# Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found

View file

@ -48,3 +48,12 @@ class TestSearchEpcIntegration:
assert epc_searcher.newest_epc["lmk-key"] == lmk_key
assert epc_searcher.newest_epc["uprn"] == uprn
assert len(epc_searcher.older_epcs) == n_old_epcs
def test_search_housenumber(self):
    """
    Check that alphanumeric flat identifiers are extracted, with and without a trailing postcode
    in the address string.
    """
    cases = [
        ('Flat A11, Mortimer House, Grendon Road, Exeter', "A11"),
        ('Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL', "A9"),
    ]
    for address, expected_number in cases:
        # No separate postcode is supplied; the number must come from the address alone
        extracted = SearchEpc.get_house_number(address, None)
        assert extracted == expected_number

View file

@ -352,9 +352,11 @@ def app():
sheet_name=SHEET_NAME,
address1_colname=ADDRESS1_COLUMN,
postcode_colname=POSTCODE_COLUMN,
landlord_property_id="UPRN",
full_address_colname=FULLADDRESS_COLUMN,
full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
missing_postcodes_method=MISSING_POSTCODES_METHOD,
address1_extraction_method=ADDRESS1_METHOD,
landlord_year_built=PROPERTY_YEAR_BUILT,
landlord_uprn=UPRN_COLUMN,
landlord_property_type=PROPERTY_TYPE_COLUMN,