modifying creation of ids

2026-07-27 23:35:01 +00:00 · 2025-02-19 22:51:48 +00:00 · 2025-02-19 22:51:48 +00:00 · 75e7c13a29
commit 75e7c13a29
parent 776285dd15
2 changed files with 36 additions and 14 deletions
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@ -1,3 +1,4 @@
+import hashlib
 import os
 import re
 import tiktoken
@ -324,11 +325,24 @@ class AssetList:
        We want all figures to be positive
        :return:
        """
-        import sys
+
+        # Normalise the address + postcode (strip punctuation and whitespace, lower-case) before hashing to produce an ID
+
+        def _make_hash(value):
+            """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value."""
+            # Normalize and remove special characters for cleaner ID
+            cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower()
+
+            # Generate SHA-256 hash and truncate it
+            short_hash = hashlib.sha256(value.encode()).hexdigest()[:12]
+
+            return f"{cleaned_value}-{short_hash}"
+
+        # Apply transformation
        self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
-            self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
-            self.postcode_colname]
-        ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
+            self.standardised_asset_list[self.full_address_colname] +
+            self.standardised_asset_list[self.postcode_colname]
+        ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash)

    @staticmethod
    def _strip_postcode_from_full_address(full_address, postcode):
@ -509,5 +523,20 @@ class AssetList:
        if not self.variable_mappings and not override_empty_mappings:
            raise ValueError("Please run init_standardise first")

+        logger.info("Applying standardisation to asset list")
+
+        for variable, mapping in self.variable_mappings.items():
+            self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
+
+        if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
+            # Drop the dupes
+            pprint(
+                f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
+                f"addresses - dropping"
+            )
+            self.standardised_asset_list = self.standardised_asset_list[
+                ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
+            ]
+
    def create_lookup_mappings(self):
        pass
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -344,7 +344,8 @@ def app():
    HAS_NON_INTRUSIVES = True
    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits

-    invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
+    # Maps addresses to uprn in problematic cases
+    MANUAL_UPRN_MAP = {}

    asset_list = AssetList(
        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
@ -366,7 +367,7 @@ def app():
    )
    asset_list.init_standardise()

-    self.apply_transformations()
+    asset_list.apply_standardiation()

    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
@ -382,9 +383,6 @@ def app():
    # # If we have the non-intrusives data, this should be true
    # HAS_NON_INTRUSIVES = True

-    # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {}
-
    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)

    if MISSING_POSTCODES_METHOD is not None:
@ -464,11 +462,6 @@ def app():

    # We check for duplicated addresses
    asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
-    if asset_list["deduper"].duplicated().sum():
-        # Drop the dupes
-        print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
-        asset_list = asset_list[~asset_list["deduper"].duplicated()]
-    asset_list = asset_list.drop(columns=["deduper"])

    # We chunk up this data into 5000 rows at a time
    # Create the chunks directory