Modify creation of property IDs (use a stable SHA-256-based hash)

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-19 22:51:48 +00:00
parent 776285dd15
commit 75e7c13a29
2 changed files with 36 additions and 14 deletions

View file

@ -1,3 +1,4 @@
import hashlib
import os
import re
import tiktoken
@ -324,11 +325,24 @@ class AssetList:
We want all figures to be positive
:return:
"""
import sys
# We'll remove punctuation and whitespace from the address, before hashing to produce an ID
def _make_hash(value):
    """Build a deterministic ID from ``value``: a cleaned slug plus a short SHA-256 suffix.

    The slug is ``value`` with every character other than word characters,
    whitespace and hyphens removed, each remaining whitespace character
    replaced by an underscore, and the result lower-cased. The suffix is the
    first 12 hex digits of the SHA-256 digest of the *unmodified* ``value``
    (UTF-8 encoded), so two inputs that clean to the same slug still get
    different IDs.

    :param value: the string to derive an ID from
    :return: ``"<slug>-<12 hex chars>"``
    """
    # Strip punctuation first, keeping word characters, whitespace and hyphens.
    without_punctuation = re.sub(r"[^\w\s-]", "", value)
    # Map *every* whitespace character (tabs, newlines, non-breaking spaces,
    # not just " ") to "_" so no raw whitespace can leak into the ID. For
    # inputs that only contain ordinary spaces this is identical to
    # ``.replace(" ", "_")``.
    cleaned_value = re.sub(r"\s", "_", without_punctuation).lower()
    # Hash the original value, not the slug, and keep a short prefix of the digest.
    short_hash = hashlib.sha256(value.encode()).hexdigest()[:12]
    return f"{cleaned_value}-{short_hash}"
# Apply transformation
self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
self.postcode_colname]
).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
self.standardised_asset_list[self.full_address_colname] +
self.standardised_asset_list[self.postcode_colname]
).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash)
@staticmethod
def _strip_postcode_from_full_address(full_address, postcode):
@ -509,5 +523,20 @@ class AssetList:
if not self.variable_mappings and not override_empty_mappings:
raise ValueError("Please run init_standardise first")
logger.info("Applying standardisation to asset list")
for variable, mapping in self.variable_mappings.items():
self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
# Drop the dupes
pprint(
f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
f"addresses - dropping"
)
self.standardised_asset_list = self.standardised_asset_list[
~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
]
def create_lookup_mappings(self):
pass

View file

@ -344,7 +344,8 @@ def app():
HAS_NON_INTRUSIVES = True
PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
# Maps addresses to uprn in problematic cases
MANUAL_UPRN_MAP = {}
asset_list = AssetList(
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
@ -366,7 +367,7 @@ def app():
)
asset_list.init_standardise()
self.apply_transformations()
asset_list.apply_standardiation()
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
# DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
@ -382,9 +383,6 @@ def app():
# # If we have the non-intrusives data, this should be true
# HAS_NON_INTRUSIVES = True
# Maps addresses to uprn in problematic cases
MANUAL_UPRN_MAP = {}
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
if MISSING_POSTCODES_METHOD is not None:
@ -464,11 +462,6 @@ def app():
# We check for duplicated addresses
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
if asset_list["deduper"].duplicated().sum():
# Drop the dupes
print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
asset_list = asset_list[~asset_list["deduper"].duplicated()]
asset_list = asset_list.drop(columns=["deduper"])
# We chunk up this data into 5000 rows at a time
# Create the chunks directory