mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
modifying creation of ids
This commit is contained in:
parent
776285dd15
commit
75e7c13a29
2 changed files with 36 additions and 14 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import tiktoken
|
||||
|
|
@ -324,11 +325,24 @@ class AssetList:
|
|||
We want all figures to be positive
|
||||
:return:
|
||||
"""
|
||||
import sys
|
||||
|
||||
# We'll remove punctuation and whitespace from the address, before hashing to produce an ID
|
||||
|
||||
def _make_hash(value):
|
||||
"""Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value."""
|
||||
# Normalize and remove special characters for cleaner ID
|
||||
cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower()
|
||||
|
||||
# Generate SHA-256 hash and truncate it
|
||||
short_hash = hashlib.sha256(value.encode()).hexdigest()[:12]
|
||||
|
||||
return f"{cleaned_value}-{short_hash}"
|
||||
|
||||
# Apply transformation
|
||||
self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
|
||||
self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
|
||||
self.postcode_colname]
|
||||
).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
|
||||
self.standardised_asset_list[self.full_address_colname] +
|
||||
self.standardised_asset_list[self.postcode_colname]
|
||||
).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash)
|
||||
|
||||
@staticmethod
|
||||
def _strip_postcode_from_full_address(full_address, postcode):
|
||||
|
|
@ -509,5 +523,20 @@ class AssetList:
|
|||
if not self.variable_mappings and not override_empty_mappings:
|
||||
raise ValueError("Please run init_standardise first")
|
||||
|
||||
logger.info("Applying standardisation to asset list")
|
||||
|
||||
for variable, mapping in self.variable_mappings.items():
|
||||
self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
|
||||
|
||||
if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
|
||||
# Drop the dupes
|
||||
pprint(
|
||||
f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
|
||||
f"addresses - dropping"
|
||||
)
|
||||
self.standardised_asset_list = self.standardised_asset_list[
|
||||
~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
|
||||
]
|
||||
|
||||
def create_lookup_mappings(self):
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -344,7 +344,8 @@ def app():
|
|||
HAS_NON_INTRUSIVES = True
|
||||
PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits
|
||||
|
||||
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
|
||||
# Maps addresses to uprn in problematic cases
|
||||
MANUAL_UPRN_MAP = {}
|
||||
|
||||
asset_list = AssetList(
|
||||
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
|
||||
|
|
@ -366,7 +367,7 @@ def app():
|
|||
)
|
||||
asset_list.init_standardise()
|
||||
|
||||
self.apply_transformations()
|
||||
asset_list.apply_standardiation()
|
||||
|
||||
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
|
||||
# DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
|
||||
|
|
@ -382,9 +383,6 @@ def app():
|
|||
# # If we have the non-intrusives data, this should be true
|
||||
# HAS_NON_INTRUSIVES = True
|
||||
|
||||
# Maps addresses to uprn in problematic cases
|
||||
MANUAL_UPRN_MAP = {}
|
||||
|
||||
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
|
||||
|
||||
if MISSING_POSTCODES_METHOD is not None:
|
||||
|
|
@ -464,11 +462,6 @@ def app():
|
|||
|
||||
# We check for duplicated addresses
|
||||
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
|
||||
if asset_list["deduper"].duplicated().sum():
|
||||
# Drop the dupes
|
||||
print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
|
||||
asset_list = asset_list[~asset_list["deduper"].duplicated()]
|
||||
asset_list = asset_list.drop(columns=["deduper"])
|
||||
|
||||
# We chunk up this data into 5000 rows at a time
|
||||
# Create the chunks directory
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue