mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
stonewater checking data
This commit is contained in:
parent
0c1ef69fba
commit
09a3d01e90
3 changed files with 292 additions and 31 deletions
|
|
@ -192,10 +192,11 @@ class SearchEpc:
|
|||
self.fast = fast
|
||||
|
||||
@staticmethod
|
||||
def get_house_number(address: str) -> str | None:
|
||||
def get_house_number(address: str, postcode=None) -> str | None:
|
||||
"""
|
||||
This method uses the usaddress library to parse an address and extract the primary house or flat number.
|
||||
"""
|
||||
|
||||
try:
|
||||
# Updated regex to catch house numbers including alphanumeric ones
|
||||
pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
|
||||
|
|
@ -207,6 +208,11 @@ class SearchEpc:
|
|||
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
|
||||
for part, type_ in parsed:
|
||||
if type_ == 'OccupancyIdentifier':
|
||||
if postcode is not None:
|
||||
if part == postcode.split(" ")[0]:
|
||||
continue
|
||||
if part == postcode.split(" ")[1]:
|
||||
continue
|
||||
return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
|
||||
# number
|
||||
|
||||
|
|
@ -216,7 +222,7 @@ class SearchEpc:
|
|||
return address_number.replace(",", "") # Remove any trailing commas
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing address: {e}")
|
||||
raise Exception(f"Error parsing address: {e}")
|
||||
|
||||
return None
|
||||
|
||||
|
|
|
|||
165
etl/customers/stonewater/no_matches.py
Normal file
165
etl/customers/stonewater/no_matches.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
# Stonewater addresses for which no match could be found, each paired with a
# free-text "Note" describing the open question or the suspected correction.
#
# Every entry has the same four keys:
#   internal_id  - integer identifier of the asset
#   full_address - the address string that failed to match
#   postcode     - the postcode of that address
#   Note         - human-readable explanation / query about the address
#
# NOTE: long notes are built from adjacent string literals, which Python joins
# with NO separator. Every fragment that is followed by another one must
# therefore end with explicit whitespace, otherwise words run together
# (e.g. "could be" + "Handley" -> "could beHandley").
no_matches = [
    {
        "internal_id": 4626,
        "full_address": "1 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS",
        "postcode": "SP5 5AS",
        "Note": (
            "No match found - all addresses in this postcode are for Mulberry Court, "
            "Sixpenny Handley, Salisbury, "
            "SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is "
            "incorrect and this could be "
            "Handley Enterprises Ltd, Unit 1 Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA. "
            "Or this could be 1 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS"
        ),
    },
    {
        "internal_id": 4627,
        "full_address": "3 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS",
        "postcode": "SP5 5AS",
        "Note": (
            "No match found - all addresses in this postcode are for Mulberry Court, "
            "Sixpenny Handley, Salisbury, "
            "SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is "
            "incorrect and this could be "
            "2 Town Farm House, Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA. "
            "Or this could be 3 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS"
        ),
    },
    {
        "internal_id": 4628,
        "full_address": "5 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS",
        "postcode": "SP5 5AS",
        "Note": (
            "No match found - all addresses in this postcode are for Mulberry Court, "
            "Sixpenny Handley, Salisbury, "
            "SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is "
            "incorrect and this could be "
            "4 Town Farm House, Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA. "
            "Or this could be 5 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS"
        ),
    },
    {
        "internal_id": 544,
        "full_address": "Room 1, Sawr, PO Box 1354, Bedford, MK41 5AB",
        "postcode": "MK41 5AB",
        "Note": "Postcode deleted in April 2024: https://checkmypostcode.uk/mk415ab",
    },
    {
        "internal_id": 5116,
        "full_address": "3 Huntspond Road, Titchfield, Fareham, PO14 4SS",
        "postcode": "PO14 4SS",
        "Note": "Is this 3 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988896",
    },
    {
        "internal_id": 5114,
        "full_address": "4 Huntspond Road, Titchfield, Fareham, PO14 4SS",
        "postcode": "PO14 4SS",
        "Note": "Is this 4 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988897",
    },
    {
        "internal_id": 5115,
        "full_address": "2 Huntspond Road, Titchfield, Fareham, PO14 4SS",
        "postcode": "PO14 4SS",
        "Note": "Is this 2 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988895",
    },
    {
        "internal_id": 5113,
        "full_address": "6 Huntspond Road, Titchfield, Fareham, PO14 4SS",
        "postcode": "PO14 4SS",
        "Note": "Is this 6 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988899",
    },
    {
        "internal_id": 5112,
        "full_address": "1 Huntspond Road, Titchfield, Fareham, PO14 4SS",
        "postcode": "PO14 4SS",
        "Note": "Is this 1 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988894",
    },
    {
        "internal_id": 3846,
        "full_address": "2 Beaufort Road, Southbourne, Bournemouth, BH6 5BD",
        "postcode": "BH6 5BD",
        "Note": (
            "2 Beaufort Road, Southbourne, Bournemouth is listed under the postcode "
            "BH6 5AL - is there a typo in "
            "the postcode?"
        ),
    },
    {
        "internal_id": 4497,
        "full_address": "11 Brokenford Lane, Totton, Southampton, SO40 9LZ",
        "postcode": "SO40 9LZ",
        "Note": (
            "This postcode doesn't appear to exist, closest is 10 brokenford lane, "
            "Totton, Southampton, SO40 9DW. "
            "What should this be?"
        ),
    },
    {
        "internal_id": 4181,
        "full_address": "25a Eastcott Road, Old Town, Swindon, SN1 3PA",
        "postcode": "SN1 3PA",
        "Note": (
            "All addresses at this postcode are for Bow Court. "
            "Closest match is 25 Eastcott Road, Swindon, SN1 3LT, but there is no 25A"
        ),
    },
    {
        "internal_id": 5447,
        "full_address": "3 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": (
            "There is no 'Send Road' at this postcode. There are a few possible "
            "matches, e.g. Flat 3, "
            "1 Send Road, RG4 8EH"
        ),
    },
    {
        "internal_id": 5449,
        "full_address": "5 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": "Same as for 3 Send Road",
    },
    {
        "internal_id": 5450,
        "full_address": "6 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": "Same as for 3 Send Road",
    },
    {
        "internal_id": 5446,
        "full_address": "1 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": "Same as for 3 Send Road",
    },
    {
        "internal_id": 5448,
        "full_address": "4 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": "Same as for 3 Send Road",
    },
    {
        "internal_id": 5451,
        "full_address": "7 Send Road, Send Road, Reading, RG4 8EP",
        "postcode": "RG4 8EP",
        "Note": "Same as for 3 Send Road",
    },
    {
        "internal_id": 4547,
        "full_address": "2 Cecil Terrace, Bemerton, Salisbury, SP2 9NE",
        "postcode": "SP2 9NE",
        "Note": (
            "Addresses for this postcode are for The Croft, SP2 9NE. "
            "Should this be 2 Cecil Terrace SP2 9ND, with "
            "uprn: 100121039798 ?"
        ),
    },
    {
        "internal_id": 4549,
        "full_address": "4 Cecil Terrace, Bemerton, Salisbury, SP2 9NE",
        "postcode": "SP2 9NE",
        "Note": (
            "Addresses for this postcode are for The Croft, SP2 9NE. "
            "Should this be 4 Cecil Terrace SP2 9ND?"
        ),
    },
    {
        "internal_id": 3601,
        "full_address": "20 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": (
            "Should this be 20 Constitution Hill Gardens, Poole, BH14 0PY? "
            "(i.e. postcode is wrong) "
            "uprn: 10001086693"
        ),
    },
    {
        "internal_id": 3592,
        "full_address": "7 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3594,
        "full_address": "9 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3591,
        "full_address": "6 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3593,
        "full_address": "8 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3590,
        "full_address": "5 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3589,
        "full_address": "3 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3600,
        "full_address": "18 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3599,
        "full_address": "17 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3598,
        "full_address": "15 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3608,
        "full_address": "26 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3610,
        "full_address": "30 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3603,
        "full_address": "22 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3612,
        "full_address": "32 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3595,
        "full_address": "10 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3613,
        "full_address": "34 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0PY ?",
    },
    {
        "internal_id": 3597,
        "full_address": "12 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3602,
        "full_address": "21 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3606,
        "full_address": "19 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3604,
        "full_address": "23 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3605,
        "full_address": "25 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3609,
        "full_address": "29 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3596,
        "full_address": "11 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3607,
        "full_address": "27 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 3611,
        "full_address": "31 Constitution Hill, Parkstone, Poole, BH14 0PX",
        "postcode": "BH14 0PX",
        "Note": "Should the postcode be BH14 0QB ?",
    },
    {
        "internal_id": 5622,
        "full_address": "26 Roman Way, Andover, SP10 5HZ",
        "postcode": "SP10 5HZ",
        "Note": "Should this postcode be SP10 5JU ?",
    },
]
|
||||
|
|
@ -433,28 +433,28 @@ def app():
|
|||
problematic_errors.append(row["internal_id"])
|
||||
|
||||
# Store to S3
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_os),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_os_all),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_errors),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(problematic_os),
|
||||
# s3_file_name="customers/Stonewater/clustering/problematic_os.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
#
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(problematic_os_all),
|
||||
# s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
#
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(problematic_errors),
|
||||
# s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
# Next steps: We should collate all of the data and produce 1 big dataset
|
||||
|
||||
problematic_os_df = pd.DataFrame(problematic_os)
|
||||
problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
|
||||
problematic_address_comparison = problematic[["internal_id", "full_address", "postcode", "house_number"]].merge(
|
||||
problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
|
|
@ -473,28 +473,50 @@ def app():
|
|||
),
|
||||
axis=1
|
||||
)
|
||||
problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
|
||||
ascending=True)
|
||||
problematic_address_comparison = problematic_address_comparison.sort_values(
|
||||
"match_similarity_score", ascending=True
|
||||
)
|
||||
|
||||
# let's do a house number extraction
|
||||
problematic_address_comparison["extracted_house_number"] = problematic_address_comparison.apply(
|
||||
lambda x: SearchEpc.get_house_number(x["ADDRESS"], x["OS_POSTCODE"]), axis=1
|
||||
)
|
||||
|
||||
problematic_address_comparison["house_numbers_different"] = (
|
||||
problematic_address_comparison["house_number"].str.lower().str.split(",").str[0].str.split(" ").str[0] !=
|
||||
problematic_address_comparison[
|
||||
"extracted_house_number"].str.lower()
|
||||
)
|
||||
|
||||
# We perform a final check
|
||||
# Take anything where the postcodes don't match, where the house numbers are different and the match similarity
|
||||
# is less than 90, or the match similarity is less than 80
|
||||
final_check = problematic_address_comparison[
|
||||
(problematic_address_comparison["match_similarity_score"] <= 90) |
|
||||
(~problematic_address_comparison["postcodes_match"])
|
||||
]
|
||||
]
|
||||
final_check = final_check.sort_values("match_similarity_score", ascending=False)
|
||||
final_check = final_check.reset_index(drop=True)
|
||||
|
||||
final_best_matches = []
|
||||
no_matches = []
|
||||
for _, row in final_check.iterrows():
|
||||
os_data = problematic_os_all[row["internal_id"]]
|
||||
os_data = pd.DataFrame(
|
||||
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
|
||||
)
|
||||
os_data["postcode"] = np.where(
|
||||
~pd.isnull(os_data["POSTCODE"]),
|
||||
os_data["POSTCODE"],
|
||||
os_data["POSTCODE_LOCATOR"]
|
||||
)
|
||||
|
||||
if ("POSTCODE_LOCATOR" in os_data.columns) and ("POSTCODE" in os_data.columns):
|
||||
os_data["postcode"] = np.where(
|
||||
~pd.isnull(os_data["POSTCODE"]),
|
||||
os_data["POSTCODE"],
|
||||
os_data["POSTCODE_LOCATOR"]
|
||||
)
|
||||
elif "POSTCODE" in os_data.columns:
|
||||
os_data["postcode"] = os_data["POSTCODE"]
|
||||
else:
|
||||
os_data["postcode"] = os_data["POSTCODE_LOCATOR"]
|
||||
os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
|
||||
if os_data.shape[0] == 1:
|
||||
if os_data.shape[0] >= 1:
|
||||
final_best_matches.append(
|
||||
{
|
||||
"internal_id": row["internal_id"],
|
||||
|
|
@ -502,4 +524,72 @@ def app():
|
|||
}
|
||||
)
|
||||
else:
|
||||
blah
|
||||
no_matches.append(
|
||||
{
|
||||
"internal_id": row["internal_id"],
|
||||
"full_address": row["full_address"],
|
||||
"postcode": row["postcode"]
|
||||
}
|
||||
)
|
||||
|
||||
no_matches = pd.DataFrame(no_matches)
|
||||
|
||||
# Data to be confirmed
|
||||
from etl.customers.stonewater.no_matches import no_matches
|
||||
no_matches_to_export = pd.DataFrame(no_matches)
|
||||
no_matches_to_export = asset_list.merge(
|
||||
no_matches_to_export[["internal_id", "Note"]],
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
).rename(
|
||||
columns={
|
||||
"internal_id": "Osm. ID",
|
||||
"customer_asset_id": "Org. ref.",
|
||||
"external_address_id": "Address ID",
|
||||
}
|
||||
)
|
||||
no_matches_to_export.to_excel("Stonewater - addresses with no matches.xlsx", index=False)
|
||||
|
||||
# We also confirm final_best_matches
|
||||
final_best_matches_df = pd.DataFrame(final_best_matches)[
|
||||
["internal_id", "ADDRESS", "UPRN"]
|
||||
].rename(
|
||||
columns={
|
||||
"ADDRESS": "Ordnance Survey Address - same postcode (best match)",
|
||||
"UPRN": "UPRN - same postcode (best match)"
|
||||
}
|
||||
)
|
||||
# We also get their original match
|
||||
final_best_matches_df = final_best_matches_df.merge(
|
||||
problematic[["internal_id", "ADDRESS", "UPRN"]].rename(
|
||||
columns={
|
||||
"ADDRESS": "Ordnance Survey Address - best possible match",
|
||||
"UPRN": "UPRN - best possible match"
|
||||
}
|
||||
),
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
# merge on the original data
|
||||
final_best_matches_df = asset_list.merge(
|
||||
final_best_matches_df,
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
).rename(
|
||||
columns={
|
||||
"internal_id": "Osm. ID",
|
||||
"customer_asset_id": "Org. ref.",
|
||||
"external_address_id": "Address ID",
|
||||
}
|
||||
)
|
||||
|
||||
# "Osm. ID": "internal_id",
|
||||
# "Org. ref.": "customer_asset_id",
|
||||
# "Postcode": "postcode",
|
||||
# "House no": "house_number",
|
||||
# "Name": "address1",
|
||||
# "Address line 2": "address2",
|
||||
# "City/Town": "city_town",
|
||||
# "County": "county",
|
||||
# "Address ID": "external_address_id",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue