diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 87402924..b153b624 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,3 +1,4 @@ +import hashlib import os import re import tiktoken @@ -324,11 +325,24 @@ class AssetList: We want all figures to be positive :return: """ - import sys + + # We'll remove punctuation and whitespace from the address, before hashing to produce an ID + + def _make_hash(value): + """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value.""" + # Normalize and remove special characters for cleaner ID + cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower() + + # Generate SHA-256 hash and truncate it + short_hash = hashlib.sha256(value.encode()).hexdigest()[:12] + + return f"{cleaned_value}-{short_hash}" + + # Apply transformation self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( - self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ - self.postcode_colname] - ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + self.standardised_asset_list[self.full_address_colname] + + self.standardised_asset_list[self.postcode_colname] + ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash) @staticmethod def _strip_postcode_from_full_address(full_address, postcode): @@ -509,5 +523,20 @@ class AssetList: if not self.variable_mappings and not override_empty_mappings: raise ValueError("Please run init_standardise first") + logger.info("Applying standardisation to asset list") + + for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + # Drop the dupes + pprint( + f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " + f"addresses - dropping" + ) + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ] + def create_lookup_mappings(self): pass diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1289fb09..54ae2280 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -344,7 +344,8 @@ def app(): HAS_NON_INTRUSIVES = True PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits - invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = {} asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), @@ -366,7 +367,7 @@ def app(): ) asset_list.init_standardise() - self.apply_transformations() + asset_list.apply_standardiation() # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -382,9 +383,6 @@ def app(): # # If we have the non-intrusives data, this should be true # HAS_NON_INTRUSIVES = True - # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) if MISSING_POSTCODES_METHOD is not None: @@ -464,11 +462,6 @@ def app(): # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] - if asset_list["deduper"].duplicated().sum(): - # Drop the dupes - print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") - asset_list = asset_list[~asset_list["deduper"].duplicated()] - asset_list = asset_list.drop(columns=["deduper"]) # We chunk up this data into 5000 rows at a time # Create the chunks directory