diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..96ad7a95 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..fb10c6b0 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 2a16e82f..35da9c3b 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,5 +1,10 @@
import os
+import usaddress
import pandas as pd
+from utils.logger import setup_logger
+from backend.SearchEpc import SearchEpc
+
+logger = setup_logger()
class AssetList:
@@ -15,6 +20,15 @@ class AssetList:
"address1_extraction" # This method will use the NLP model to extract address1
]
+ STANDARD_PROPERTY_TYPES = [
+ "house",
+ "flat",
+ "bungalow",
+ "maisonette",
+ "park home",
+ "block house",
+ ]
+
def __init__(
self,
local_filepath,
@@ -26,6 +40,10 @@ class AssetList:
missing_postcodes_method=None,
landlord_year_built=None,
landlord_uprn=None,
+ landlord_property_type=None,
+ landlord_wall_construction=None,
+ landlord_heating_system=None,
+ landlord_existing_pv=None,
header=0
):
self.local_filepath = local_filepath
@@ -43,21 +61,72 @@ class AssetList:
self.full_address_colname = full_address_colname
self.landlord_year_built = landlord_year_built
self.landlord_uprn = landlord_uprn
+ self.landlord_property_type = landlord_property_type
+ self.landlord_wall_construction = landlord_wall_construction
+ self.landlord_heating_system = landlord_heating_system
+ self.landlord_existing_pv = landlord_existing_pv
# parameters for cleaning
self.full_address_cols_to_concat = full_address_cols_to_concat
self.missing_postcodes_method = missing_postcodes_method
+ self.debug_information = {
+ "property_type": None,
+ "wall_construction": None,
+ "heating_system": None,
+ "existing_pv": None
+ }
+
+ @classmethod
+ def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
+
+ if method not in cls.ADDRESS_1_CLEANING_METHODS:
+ raise ValueError(f"Method {method} for producing address1 not recognized")
+
+ if method == "first_two_words":
+ asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+ return asset_list
+
+ if method == "first_word":
+ asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
+ return asset_list
+
+ if method == "house_number_extraction":
+ asset_list["address1_extracted"] = asset_list.apply(
+ lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+ axis=1
+ )
+ return asset_list
+
+ if method == "address1_extraction":
+
+ x = asset_list_df[FULLADDRESS_COLUMN].values[0]
+ parsed = usaddress.parse(x)
+
+ def extract_address_1():
+
+
+ raise ValueError(f"Method {method} not recognized")
+
+ @staticmethod
+ def _address1_extraction(x):
+
+
def standardise(self):
"""
This function is used to standardise the asset list
:return: standardised asset list
"""
+ if self.address1_colname is None:
+ # If we do not have this, we produce it
+
+
# We keep just the columns we care about and will work through the various columns and standardise
self.standardised_asset_list = self.raw_asset_list[
[
-
+ self.address1_colname, self.postcode_colname, self.full_address_colname,
+ self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
]
]
diff --git a/asset_list/README.md b/asset_list/README.md
new file mode 100644
index 00000000..1bf734a4
--- /dev/null
+++ b/asset_list/README.md
@@ -0,0 +1,172 @@
+# libpostal Installation Guide for macOS M1
+
+## Overview
+
+`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
+provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
+
+---
+
+## Prerequisites
+
+Before installing `libpostal`, ensure you have the necessary dependencies installed.
+
+### **1️⃣ Install Required Dependencies**
+
+Open a terminal and run:
+
+```bash
+brew install curl autoconf automake libtool pkg-config
+```
+
+### **2️⃣ Clone the libpostal Repository**
+
+```bash
+git clone https://github.com/openvenues/libpostal.git
+cd libpostal
+```
+
+### **3️⃣ Run Bootstrap Script**
+
+```bash
+./bootstrap.sh
+```
+
+### **4️⃣ Configure the Build (Important for M1 Macs)**
+
+SSE2 is an x86-only instruction set, so Apple Silicon (ARM64) chips cannot use it; you **must** disable SSE2 for the build to succeed.
+
+```bash
+./configure --disable-sse2 --datadir=/usr/local/libpostal_data
+```
+
+*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
+
+### **5️⃣ Compile and Install**
+
+```bash
+make -j$(sysctl -n hw.ncpu)
+sudo make install
+```
+
+### **6️⃣ Install Python Bindings**
+
+Once `libpostal` is installed, install the Python package:
+
+```bash
+pip install postal
+```
+
+---
+
+## ✅ **Verify Installation**
+
+To check if `libpostal` was installed successfully, run:
+
+```bash
+python -c "from postal.parser import parse_address; print(parse_address('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
+```
+
+**Expected Output:**
+
+```
+[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')]
+```
+
+---
+
+## **Usage Example in Python**
+
+### **Address Parsing**
+
+```python
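+from postal.parser import parse_address
+
+address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
+# parse_address returns (value, label) pairs, so invert them to key the dict by label
+parsed_address = {label: value for value, label in parse_address(address)}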
+
+print(parsed_address)
+```
+
+**Expected Output:**
+
+```python
+{
+ 'house_number': '23',
+ 'road': 'Clifton Hill',
+ 'city': 'Newtown',
+ 'city': 'Exeter',
+ 'postcode': 'EX1 2DL'
+}
+```
+
+### **Address Normalization**
+
+```python
+from postal.normalize import normalize_string
+
+address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
+normalized = normalize_string(address)
+
+print(normalized)
+```
+
+---
+
+## **Troubleshooting**
+
+### **1️⃣ libpostal Not Found?**
+
+If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
+
+- You ran `sudo make install`
+- Your Python environment recognizes `postal`. Try:
+ ```bash
+ pip install postal --no-cache-dir
+ ```
+- If using a virtual environment (`venv`), activate it before running Python.
+
+### **2️⃣ Compilation Issues on macOS?**
+
+If `make` fails, try running:
+
+```bash
+brew reinstall autoconf automake libtool pkg-config
+```
+
+Then restart the installation process.
+
+### **3️⃣ Can't Find libpostal Data Directory?**
+
+Ensure `libpostal_data` exists in the correct directory:
+
+```bash
+ls /usr/local/libpostal_data
+```
+
+If it is missing, re-run `./configure` with the correct `--datadir` path, then run `make` and `sudo make install` again.
+
+---
+
+## **Uninstallation**
+
+To remove `libpostal`, run:
+
+```bash
+sudo rm -rf /usr/local/lib/libpostal*
+sudo rm -rf /usr/local/include/libpostal*
+rm -rf ~/libpostal
+pip uninstall postal
+```
+
+---
+
+## **Additional Resources**
+
+- [Libpostal GitHub](https://github.com/openvenues/libpostal)
+- [Libpostal Python Bindings](https://pypi.org/project/postal/)
+- [Homebrew](https://brew.sh/)
+
+---
+
+### You're all set! Now you can use `libpostal` to parse and clean address data efficiently.
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
new file mode 100644
index 00000000..d77c8a58
--- /dev/null
+++ b/asset_list/requirements.txt
@@ -0,0 +1,3 @@
+postal
+pandas
+usaddress
\ No newline at end of file
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
new file mode 100644
index 00000000..f0e6ce11
--- /dev/null
+++ b/asset_list/tests/test_standardisation.py
@@ -0,0 +1,9 @@
+from asset_list.AssetList import AssetList
+
+
+def test_address1_extraction():
+ example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
+
+ AssetList._extract_address1(
+ example,
+ )
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 06082774..74dc28e0 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -346,10 +346,24 @@ def app():
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
- asset_list = AssetList(
+ self = AssetList(
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
header=0,
- sheet_name=SHEET_NAME
+ sheet_name=SHEET_NAME,
+ address1_colname=ADDRESS1_COLUMN,
+ postcode_colname=POSTCODE_COLUMN,
+ full_address_colname=FULLADDRESS_COLUMN,
+ full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
+ missing_postcodes_method=MISSING_POSTCODES_METHOD,
+ landlord_year_built=PROPERTY_YEAR_BUILT,
+ landlord_uprn=UPRN_COLUMN,
+ landlord_property_type=PROPERTY_TYPE_COLUMN,
+ landlord_wall_construction="Wall Construction (EPC)",
+ landlord_heating_system="Heat Source",
+ landlord_existing_pv="PV (Y/N)"
+ )
+ self.standardised_asset_list(
+ # In here, we might want to pass some specific remaps
)
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"