mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on address extraction
This commit is contained in:
parent
7e9347e530
commit
cb0194c3b9
7 changed files with 130 additions and 202 deletions
|
|
@ -17,7 +17,7 @@ class AssetList:
|
|||
"first_two_words", # This method will split on the first two words, where the separator is a space
|
||||
"first_word", # This method will split on the first word, where the separator is a space
|
||||
"house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber
|
||||
"address1_extraction" # This method will use the NLP model to extract address1
|
||||
# "address1_extraction" # This method will use the NLP model to extract address1
|
||||
]
|
||||
|
||||
STANDARD_PROPERTY_TYPES = [
|
||||
|
|
@ -29,6 +29,19 @@ class AssetList:
|
|||
"block house",
|
||||
]
|
||||
|
||||
# Standard column Names
|
||||
STANDARD_ADDRESS_1 = "domna_address_1"
|
||||
STANDARD_POSTCODE = "domna_postcode"
|
||||
STANDARD_FULL_ADDRESS = "domna_full_address"
|
||||
STANDARD_YEAR_BUILT = "domna_year_built"
|
||||
STANDARD_UPRN = "ordnance_survey_uprn"
|
||||
STANDARD_PROPERTY_TYPE = "landlord_property_type"
|
||||
STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
|
||||
STANDARD_HEATING_SYSTEM = "landlord_heating_system"
|
||||
STANDARD_EXISTING_PV = "landlord_existing_pv"
|
||||
|
||||
DOMNA_PROPERTY_ID = "domna_property_id"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
local_filepath,
|
||||
|
|
@ -36,8 +49,10 @@ class AssetList:
|
|||
address1_colname,
|
||||
postcode_colname,
|
||||
full_address_colname,
|
||||
landlord_property_id=None,
|
||||
full_address_cols_to_concat=None,
|
||||
missing_postcodes_method=None,
|
||||
address1_extraction_method=None,
|
||||
landlord_year_built=None,
|
||||
landlord_uprn=None,
|
||||
landlord_property_type=None,
|
||||
|
|
@ -48,14 +63,15 @@ class AssetList:
|
|||
):
|
||||
self.local_filepath = local_filepath
|
||||
self.sheet_name = sheet_name
|
||||
self.standardised_asset_list = None
|
||||
# Read in the data
|
||||
self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
|
||||
self.standardised_asset_list = self.raw_asset_list.copy()
|
||||
|
||||
# We detect the presence of the non-intrusive columns
|
||||
self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
|
||||
|
||||
# Names of columns
|
||||
self.landlord_property_id = landlord_property_id
|
||||
self.address1_colname = address1_colname
|
||||
self.postcode_colname = postcode_colname
|
||||
self.full_address_colname = full_address_colname
|
||||
|
|
@ -69,6 +85,7 @@ class AssetList:
|
|||
# parameters for cleaning
|
||||
self.full_address_cols_to_concat = full_address_cols_to_concat
|
||||
self.missing_postcodes_method = missing_postcodes_method
|
||||
self.address1_extraction_method = address1_extraction_method
|
||||
|
||||
self.debug_information = {
|
||||
"property_type": None,
|
||||
|
|
@ -77,40 +94,50 @@ class AssetList:
|
|||
"existing_pv": None
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
|
||||
def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
|
||||
|
||||
if method not in cls.ADDRESS_1_CLEANING_METHODS:
|
||||
if method not in self.ADDRESS_1_CLEANING_METHODS:
|
||||
raise ValueError(f"Method {method} for producing address1 not recognized")
|
||||
|
||||
if method == "first_two_words":
|
||||
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
|
||||
asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
|
||||
return asset_list
|
||||
|
||||
if method == "first_word":
|
||||
asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
|
||||
asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
|
||||
return asset_list
|
||||
|
||||
if method == "house_number_extraction":
|
||||
asset_list["address1_extracted"] = asset_list.apply(
|
||||
asset_list[self.address1_colname] = asset_list.apply(
|
||||
lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
|
||||
axis=1
|
||||
)
|
||||
return asset_list
|
||||
|
||||
if method == "address1_extraction":
|
||||
|
||||
x = asset_list_df[FULLADDRESS_COLUMN].values[0]
|
||||
parsed = usaddress.parse(x)
|
||||
|
||||
def extract_address_1():
|
||||
|
||||
|
||||
raise ValueError(f"Method {method} not recognized")
|
||||
raise ValueError(f"Method {method} not recognized")
|
||||
|
||||
@staticmethod
|
||||
def _address1_extraction(x):
|
||||
pass
|
||||
|
||||
def create_property_id(self):
|
||||
"""
|
||||
This function creates the domna property ID, which is simply a hash of the full address and postcode
|
||||
We want all figures to be positive
|
||||
:return:
|
||||
"""
|
||||
import sys
|
||||
self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
|
||||
self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
|
||||
self.postcode_colname]
|
||||
).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
|
||||
|
||||
@staticmethod
|
||||
def _strip_postcode_from_full_address(full_address, postcode):
|
||||
cleaned = full_address.replace(postcode, "")
|
||||
# Remove any trailing commas and spaces
|
||||
cleaned = cleaned.rstrip(", ").strip(",").strip()
|
||||
return cleaned
|
||||
|
||||
def standardise(self):
|
||||
"""
|
||||
|
|
@ -118,15 +145,63 @@ class AssetList:
|
|||
:return: standardised asset list
|
||||
"""
|
||||
|
||||
if self.address1_colname is None:
|
||||
# If we do not have this, we produce it
|
||||
# Remove rows without a postcode
|
||||
if self.postcode_colname is not None:
|
||||
self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
|
||||
|
||||
# We clean up potential non-breaking spaces, and double spaces
|
||||
for col in [
|
||||
c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
|
||||
c is not None
|
||||
]:
|
||||
self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
|
||||
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
|
||||
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False)
|
||||
|
||||
if self.address1_colname is None:
|
||||
if self.address1_extraction_method is None:
|
||||
raise ValueError("Missing address 1 - please specify an extraction method")
|
||||
self.address1_colname = self.STANDARD_ADDRESS_1
|
||||
# If we do not have this, we produce it
|
||||
self.standardised_asset_list = self._extract_address1(
|
||||
asset_list=self.standardised_asset_list,
|
||||
full_address_col=self.full_address_colname,
|
||||
postcode_col=self.postcode_colname,
|
||||
method=self.address1_extraction_method
|
||||
)
|
||||
|
||||
if self.full_address_colname is None:
|
||||
if not self.full_address_cols_to_concat:
|
||||
raise ValueError("Missing full address - please specify columns to concatenate")
|
||||
self.full_address_colname = self.STANDARD_FULL_ADDRESS
|
||||
self.standardised_asset_list[self.full_address_colname] = (
|
||||
self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1)
|
||||
)
|
||||
else:
|
||||
|
||||
# Make sure to strip the postcode out of the full address
|
||||
self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
|
||||
lambda x: self._strip_postcode_from_full_address(
|
||||
full_address=x[self.full_address_colname],
|
||||
postcode=x[self.postcode_colname]
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
|
||||
# We create the domna property id
|
||||
self.create_property_id()
|
||||
|
||||
# We keep just the columns we care about and will work through the various columns and standardise
|
||||
self.standardised_asset_list = self.raw_asset_list[
|
||||
self.standardised_asset_list = self.standardised_asset_list[
|
||||
[
|
||||
self.address1_colname, self.postcode_colname, self.full_address_colname,
|
||||
self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
|
||||
self.landlord_property_id,
|
||||
self.DOMNA_PROPERTY_ID,
|
||||
self.address1_colname,
|
||||
self.postcode_colname,
|
||||
self.full_address_colname,
|
||||
self.landlord_year_built,
|
||||
self.landlord_uprn,
|
||||
self.landlord_property_type,
|
||||
]
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -1,172 +0,0 @@
|
|||
# libpostal Installation Guide for macOS M1
|
||||
|
||||
## Overview
|
||||
|
||||
`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
|
||||
provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
|
||||
|
||||
---
|
||||
|
||||
## 📌 Prerequisites
|
||||
|
||||
Before installing `libpostal`, ensure you have the necessary dependencies installed.
|
||||
|
||||
### **1️⃣ Install Required Dependencies**
|
||||
|
||||
Open a terminal and run:
|
||||
|
||||
```bash
|
||||
brew install curl autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
### **2️⃣ Clone the libpostal Repository**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/openvenues/libpostal.git
|
||||
cd libpostal
|
||||
```
|
||||
|
||||
### **3️⃣ Run Bootstrap Script**
|
||||
|
||||
```bash
|
||||
./bootstrap.sh
|
||||
```
|
||||
|
||||
### **4️⃣ Configure the Build (Important for M1 Macs)**
|
||||
|
||||
Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility.
|
||||
|
||||
```bash
|
||||
./configure --disable-sse2 --datadir=/usr/local/libpostal_data
|
||||
```
|
||||
|
||||
*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
|
||||
|
||||
### **5️⃣ Compile and Install**
|
||||
|
||||
```bash
|
||||
make -j$(sysctl -n hw.ncpu)
|
||||
sudo make install
|
||||
```
|
||||
|
||||
### **6️⃣ Install Python Bindings**
|
||||
|
||||
Once `libpostal` is installed, install the Python package:
|
||||
|
||||
```bash
|
||||
pip install postal
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ **Verify Installation**
|
||||
|
||||
To check if `libpostal` was installed successfully, run:
|
||||
|
||||
```bash
|
||||
python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
|
||||
```
|
||||
[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Usage Example in Python**
|
||||
|
||||
### **Address Parsing**
|
||||
|
||||
```python
|
||||
from postal.parser import parse_address
|
||||
|
||||
address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
|
||||
parsed_address = {label: value for value, label in parse_address(address)}
|
||||
|
||||
print(parsed_address)
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
|
||||
```python
|
||||
{
|
||||
'house_number': '23',
|
||||
'road': 'Clifton Hill',
|
||||
'city': 'Newtown',
|
||||
'city': 'Exeter',
|
||||
'postcode': 'EX1 2DL'
|
||||
}
|
||||
```
|
||||
|
||||
### **Address Normalization**
|
||||
|
||||
```python
|
||||
from postal.normalize import normalize_string
|
||||
|
||||
address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
|
||||
normalized = normalize_string(address)
|
||||
|
||||
print(normalized)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Troubleshooting**
|
||||
|
||||
### **1️⃣ libpostal Not Found?**
|
||||
|
||||
If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
|
||||
|
||||
- You ran `sudo make install`
|
||||
- Your Python environment recognizes `postal`. Try:
|
||||
```bash
|
||||
pip install postal --no-cache-dir
|
||||
```
|
||||
- If using a virtual environment (`venv`), activate it before running Python.
|
||||
|
||||
### **2️⃣ Compilation Issues on macOS?**
|
||||
|
||||
If `make` fails, try running:
|
||||
|
||||
```bash
|
||||
brew reinstall autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
Then restart the installation process.
|
||||
|
||||
### **3️⃣ Can't Find libpostal Data Directory?**
|
||||
|
||||
Ensure `libpostal_data` exists in the correct directory:
|
||||
|
||||
```bash
|
||||
ls /usr/local/libpostal_data
|
||||
```
|
||||
|
||||
If missing, re-run `./configure` with the correct path.
|
||||
|
||||
---
|
||||
|
||||
## 🛠 **Uninstallation**
|
||||
|
||||
To remove `libpostal`, run:
|
||||
|
||||
```bash
|
||||
sudo rm -rf /usr/local/lib/libpostal*
|
||||
sudo rm -rf /usr/local/include/libpostal*
|
||||
rm -rf ~/libpostal
|
||||
pip uninstall postal
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📌 **Additional Resources**
|
||||
|
||||
- [Libpostal GitHub](https://github.com/openvenues/libpostal)
|
||||
- [Libpostal Python Bindings](https://pypi.org/project/postal/)
|
||||
- [Homebrew](https://brew.sh/)
|
||||
|
||||
---
|
||||
|
||||
### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀
|
||||
|
|
@ -1,3 +1,8 @@
|
|||
postal
|
||||
pandas
|
||||
usaddress
|
||||
usaddress
|
||||
pydantic-settings==2.6.0
|
||||
epc-api-python==1.0.2
|
||||
fuzzywuzzy
|
||||
boto3
|
||||
openpyxl
|
||||
|
|
@ -1,9 +1,12 @@
|
|||
from asset_list.AssetList import AssetList
|
||||
from backend.SearchEpc import
|
||||
|
||||
|
||||
|
||||
def test_address1_extraction():
|
||||
example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
|
||||
|
||||
AssetList._extract_address1(
|
||||
example,
|
||||
)
|
||||
# AssetList._extract_address1(
|
||||
# example,
|
||||
# )
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -208,9 +208,14 @@ class SearchEpc:
|
|||
try:
|
||||
# Updated regex to catch house numbers including alphanumeric ones
|
||||
pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
|
||||
match = re.search(pattern, address)
|
||||
if match:
|
||||
return next(g for g in match.groups() if g is not None)
|
||||
match1 = re.search(pattern, address)
|
||||
if match1:
|
||||
return next(g for g in match1.groups() if g is not None)
|
||||
|
||||
pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)'
|
||||
match2 = re.search(pattern2, address)
|
||||
if match2:
|
||||
return match2.group(2)
|
||||
|
||||
parsed = usaddress.parse(address)
|
||||
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
|
||||
|
|
@ -221,7 +226,8 @@ class SearchEpc:
|
|||
continue
|
||||
if part == postcode.split(" ")[1]:
|
||||
continue
|
||||
return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
|
||||
return part.rstrip(
|
||||
",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
|
||||
# number
|
||||
|
||||
# Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found
|
||||
|
|
|
|||
|
|
@ -48,3 +48,12 @@ class TestSearchEpcIntegration:
|
|||
assert epc_searcher.newest_epc["lmk-key"] == lmk_key
|
||||
assert epc_searcher.newest_epc["uprn"] == uprn
|
||||
assert len(epc_searcher.older_epcs) == n_old_epcs
|
||||
|
||||
def test_search_housenumber(self):
|
||||
eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
|
||||
res1 = SearchEpc.get_house_number(eg1, None)
|
||||
assert res1 == "A11"
|
||||
|
||||
eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
|
||||
res2 = SearchEpc.get_house_number(eg2, None)
|
||||
assert res2 == "A9"
|
||||
|
|
|
|||
|
|
@ -352,9 +352,11 @@ def app():
|
|||
sheet_name=SHEET_NAME,
|
||||
address1_colname=ADDRESS1_COLUMN,
|
||||
postcode_colname=POSTCODE_COLUMN,
|
||||
landlord_property_id="UPRN",
|
||||
full_address_colname=FULLADDRESS_COLUMN,
|
||||
full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
|
||||
missing_postcodes_method=MISSING_POSTCODES_METHOD,
|
||||
address1_extraction_method=ADDRESS1_METHOD,
|
||||
landlord_year_built=PROPERTY_YEAR_BUILT,
|
||||
landlord_uprn=UPRN_COLUMN,
|
||||
landlord_property_type=PROPERTY_TYPE_COLUMN,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue