diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 62ae307f..9724ffd1 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -192,10 +192,11 @@ class SearchEpc: self.fast = fast @staticmethod - def get_house_number(address: str) -> str | None: + def get_house_number(address: str, postcode=None) -> str | None: """ This method uses the usaddress library to parse an address and extract the primary house or flat number. """ + try: # Updated regex to catch house numbers including alphanumeric ones pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' @@ -207,6 +208,11 @@ class SearchEpc: # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected for part, type_ in parsed: if type_ == 'OccupancyIdentifier': + if postcode is not None: + if part == postcode.split(" ")[0]: + continue + if part == postcode.split(" ")[1]: + continue return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number @@ -216,7 +222,7 @@ class SearchEpc: return address_number.replace(",", "") # Remove any trailing commas except Exception as e: - print(f"Error parsing address: {e}") + raise Exception(f"Error parsing address: {e}") return None diff --git a/etl/customers/stonewater/no_matches.py b/etl/customers/stonewater/no_matches.py new file mode 100644 index 00000000..e7c122b1 --- /dev/null +++ b/etl/customers/stonewater/no_matches.py @@ -0,0 +1,165 @@ +no_matches = [ + { + 'internal_id': 4626, 'full_address': '1 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS', 'postcode': 'SP5 5AS', + 'Note': 'No match found - all addresses in this postcode are for Mulberry Court, Sixpenny Handley, Salisbury, ' + 'SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is incorrect and this could be' + 'Handley Enterprises Ltd, Unit 1 Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA.' + 'Or this could be 1 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS' + }, + { + 'internal_id': 4627, 'full_address': '3 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS', 'postcode': 'SP5 5AS', + 'Note': 'No match found - all addresses in this postcode are for Mulberry Court, Sixpenny Handley, Salisbury, ' + 'SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is incorrect and this could be' + '2 Town Farm House, Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA' + 'Or this could be 3 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS' + }, + { + 'internal_id': 4628, 'full_address': '5 Dean Lane, Sixpenny Handley, Salisbury, SP5 5AS', 'postcode': 'SP5 5AS', + 'Note': 'No match found - all addresses in this postcode are for Mulberry Court, Sixpenny Handley, Salisbury, ' + 'SP5 5AS, addresses not recognised by Zoopla - possibly the postcode is incorrect and this could be' + '4 Town Farm House, Dean Lane, Sixpenny Handley, Salisbury, SP5 5PA' + 'Or this could be 5 Mulberry Court Sixpenny Handley, Salisbury SP5 5AS' + }, + { + 'internal_id': 544, 'full_address': 'Room 1, Sawr, PO Box 1354, Bedford, MK41 5AB', 'postcode': 'MK41 5AB', + "Note": "Postcode deleted in April 2024: https://checkmypostcode.uk/mk415ab" + }, + { + 'internal_id': 5116, 'full_address': '3 Huntspond Road, Titchfield, Fareham, PO14 4SS', 'postcode': 'PO14 4SS', + 'Note': 'Is this 3 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988896' + }, + { + 'internal_id': 5114, 'full_address': '4 Huntspond Road, Titchfield, Fareham, PO14 4SS', 'postcode': 'PO14 4SS', + 'Note': 'Is this 4 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988897' + }, + { + 'internal_id': 5115, 'full_address': '2 Huntspond Road, Titchfield, Fareham, PO14 4SS', 'postcode': 'PO14 4SS', + 'Note': 'Is this 2 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988895' + }, + { + 'internal_id': 5113, 'full_address': '6 Huntspond Road, Titchfield, Fareham, PO14 4SS', 'postcode': 'PO14 4SS', + 'Note': 'Is this 6 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988899' + }, + { + 'internal_id': 5112, 'full_address': '1 Huntspond Road, Titchfield, Fareham, PO14 4SS', 'postcode': 'PO14 4SS', + 'Note': 'Is this 1 St Francis Court, 195 Hunts Pond Road, Fareham, PO14 4SS, uprn: 100061988894' + }, + { + 'internal_id': 3846, 'full_address': '2 Beaufort Road, Southbourne, Bournemouth, BH6 5BD', + 'postcode': 'BH6 5BD', + 'Note': "2 Beaufort Road, Southbourne, Bournemouth is listed under the postcode BH6 5AL - is there a typo in " + "the postcode?" + }, + { + 'internal_id': 4497, 'full_address': '11 Brokenford Lane, Totton, Southampton, SO40 9LZ', + 'postcode': 'SO40 9LZ', + 'Note': "This postcode doesn't appear to exist, closest is 10 brokenford lane, Totton, Southampton, SO40 9DW." + "What should this be?" + }, + { + 'internal_id': 4181, 'full_address': '25a Eastcott Road, Old Town, Swindon, SN1 3PA', 'postcode': 'SN1 3PA', + 'Note': 'All addresses at this postcode are for Bow Court. ' + 'Closest match is 25 Eastcott Road, Swindon, SN1 3LT, but there is no 25A' + }, + { + 'internal_id': 5447, 'full_address': '3 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "These is no 'Send Road' at this postcode. There are a few possible matches, e.g. Flat 3, " + "1 Send Road, RG4 8EH" + }, + { + 'internal_id': 5449, 'full_address': '5 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "Same as for 3 Send Road" + }, + { + 'internal_id': 5450, 'full_address': '6 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "Same as for 3 Send Road" + }, + { + 'internal_id': 5446, 'full_address': '1 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "Same as for 3 Send Road" + }, + { + 'internal_id': 5448, 'full_address': '4 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "Same as for 3 Send Road" + }, + { + 'internal_id': 5451, 'full_address': '7 Send Road, Send Road, Reading, RG4 8EP', 'postcode': 'RG4 8EP', + "Note": "Same as for 3 Send Road" + }, + { + 'internal_id': 4547, 'full_address': '2 Cecil Terrace, Bemerton, Salisbury, SP2 9NE', 'postcode': 'SP2 9NE', + "Note": "Addresses for this postcode are for The Croft, SP2 9NE. Should this be 2 Cecil Terrace SP2 9ND, with" + "uprn: 100121039798 ?" + }, + { + 'internal_id': 4549, 'full_address': '4 Cecil Terrace, Bemerton, Salisbury, SP2 9NE', 'postcode': 'SP2 9NE', + "Note": "Addresses for this postcode are for The Croft, SP2 9NE. Should this be 4 Cecil Terrace SP2 9ND?" + }, + { + 'internal_id': 3601, 'full_address': '20 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should this be 20 Constitution Hill Gardens, Poole, BH14 0PY? (i.e. postcode is wrong) " + "uprn: 10001086693" + }, + { + 'internal_id': 3592, 'full_address': '7 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?" + }, + { + 'internal_id': 3594, 'full_address': '9 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?" + }, + { + 'internal_id': 3591, 'full_address': '6 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?" + }, + { + 'internal_id': 3593, 'full_address': '8 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + { + 'internal_id': 3590, 'full_address': '5 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + { + 'internal_id': 3589, 'full_address': '3 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + { + 'internal_id': 3600, 'full_address': '18 Constitution Hill, Parkstone, Poole, BH14 0PX', + 'postcode': 'BH14 0PX', "Note": "Should the postcode be BH14 0PY ?"}, + { + 'internal_id': 3599, 'full_address': '17 Constitution Hill, Parkstone, Poole, BH14 0PX', + 'postcode': 'BH14 0PX', "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3598, 'full_address': '15 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3608, 'full_address': '26 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3610, 'full_address': '30 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3603, 'full_address': '22 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3612, 'full_address': '32 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3595, 'full_address': '10 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + {'internal_id': 3613, 'full_address': '34 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0PY ?"}, + + {'internal_id': 3597, 'full_address': '12 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3602, 'full_address': '21 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3606, 'full_address': '19 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3604, 'full_address': '23 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3605, 'full_address': '25 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3609, 'full_address': '29 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3596, 'full_address': '11 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3607, 'full_address': '27 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 3611, 'full_address': '31 Constitution Hill, Parkstone, Poole, BH14 0PX', 'postcode': 'BH14 0PX', + "Note": "Should the postcode be BH14 0QB ?"}, + {'internal_id': 5622, 'full_address': '26 Roman Way, Andover, SP10 5HZ', 'postcode': 'SP10 5HZ', + 'Note': 'Shoul this postcode be SP10 5JU ?'} +] diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 8a3725b9..f2ef9a8b 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -433,28 +433,28 @@ def app(): problematic_errors.append(row["internal_id"]) # Store to S3 - save_data_to_s3( - data=json.dumps(problematic_os), - s3_file_name="customers/Stonewater/clustering/problematic_os.json", - bucket_name="retrofit-data-dev" - ) - - save_data_to_s3( - data=json.dumps(problematic_os_all), - s3_file_name="customers/Stonewater/clustering/problematic_os_all.json", - bucket_name="retrofit-data-dev" - ) - - save_data_to_s3( - data=json.dumps(problematic_errors), - s3_file_name="customers/Stonewater/clustering/problematic_errors.json", - bucket_name="retrofit-data-dev" - ) + # save_data_to_s3( + # data=json.dumps(problematic_os), + # s3_file_name="customers/Stonewater/clustering/problematic_os.json", + # bucket_name="retrofit-data-dev" + # ) + # + # save_data_to_s3( + # data=json.dumps(problematic_os_all), + # s3_file_name="customers/Stonewater/clustering/problematic_os_all.json", + # bucket_name="retrofit-data-dev" + # ) + # + # save_data_to_s3( + # data=json.dumps(problematic_errors), + # s3_file_name="customers/Stonewater/clustering/problematic_errors.json", + # bucket_name="retrofit-data-dev" + # ) # Next steps: We should collate all of the data and produce 1 big dataset problematic_os_df = pd.DataFrame(problematic_os) - problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge( + problematic_address_comparison = problematic[["internal_id", "full_address", "postcode", "house_number"]].merge( problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]], how="inner", on="internal_id" @@ -473,28 +473,50 @@ def app(): ), axis=1 ) - problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score", - ascending=True) + problematic_address_comparison = problematic_address_comparison.sort_values( + "match_similarity_score", ascending=True + ) + + # let's do a house number extraction + problematic_address_comparison["extracted_house_number"] = problematic_address_comparison.apply( + lambda x: SearchEpc.get_house_number(x["ADDRESS"], x["OS_POSTCODE"]), axis=1 + ) + + problematic_address_comparison["house_numbers_different"] = ( + problematic_address_comparison["house_number"].str.lower().str.split(",").str[0].str.split(" ").str[0] != + problematic_address_comparison[ + "extracted_house_number"].str.lower() + ) # We perform a final check + # Take anything where the postcodes don't match, where the house numbers are different and the match similarity + # is less than 90, or the match similarity is less than 80 final_check = problematic_address_comparison[ - (problematic_address_comparison["match_similarity_score"] <= 90) | (~problematic_address_comparison["postcodes_match"]) - ] + ] + final_check = final_check.sort_values("match_similarity_score", ascending=False) + final_check = final_check.reset_index(drop=True) final_best_matches = [] + no_matches = [] for _, row in final_check.iterrows(): os_data = problematic_os_all[row["internal_id"]] os_data = pd.DataFrame( [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data] ) - os_data["postcode"] = np.where( - ~pd.isnull(os_data["POSTCODE"]), - os_data["POSTCODE"], - os_data["POSTCODE_LOCATOR"] - ) + + if ("POSTCODE_LOCATOR" in os_data.columns) and ("POSTCODE" in os_data.columns): + os_data["postcode"] = np.where( + ~pd.isnull(os_data["POSTCODE"]), + os_data["POSTCODE"], + os_data["POSTCODE_LOCATOR"] + ) + elif "POSTCODE" in os_data.columns: + os_data["postcode"] = os_data["POSTCODE"] + else: + os_data["postcode"] = os_data["POSTCODE_LOCATOR"] os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()] - if os_data.shape[0] == 1: + if os_data.shape[0] >= 1: final_best_matches.append( { "internal_id": row["internal_id"], @@ -502,4 +524,72 @@ def app(): } ) else: - blah + no_matches.append( + { + "internal_id": row["internal_id"], + "full_address": row["full_address"], + "postcode": row["postcode"] + } + ) + + no_matches = pd.DataFrame(no_matches) + + # Data to be confirmed + from etl.customers.stonewater.no_matches import no_matches + no_matches_to_export = pd.DataFrame(no_matches) + no_matches_to_export = asset_list.merge( + no_matches_to_export[["internal_id", "Note"]], + how="inner", + on="internal_id" + ).rename( + columns={ + "internal_id": "Osm. ID", + "customer_asset_id": "Org. ref.", + "external_address_id": "Address ID", + } + ) + no_matches_to_export.to_excel("Stonewater - addresses with no matches.xlsx", index=False) + + # We also confirm final_best_matches + final_best_matches_df = pd.DataFrame(final_best_matches)[ + ["internal_id", "ADDRESS", "UPRN"] + ].rename( + columns={ + "ADDRESS": "Ordnance Survey Address - same postcode (best match)", + "UPRN": "UPRN - same postcode (best match)" + } + ) + # We also get their original match + final_best_matches_df = final_best_matches_df.merge( + problematic[["internal_id", "ADDRESS", "UPRN"]].rename( + columns={ + "ADDRESS": "Ordnance Survey Address - best possible match", + "UPRN": "UPRN - best possible match" + } + ), + how="inner", + on="internal_id" + ) + + # merge on the original data + final_best_matches_df = asset_list.merge( + final_best_matches_df, + how="inner", + on="internal_id" + ).rename( + columns={ + "internal_id": "Osm. ID", + "customer_asset_id": "Org. ref.", + "external_address_id": "Address ID", + } + ) + + # "Osm. ID": "internal_id", + # "Org. ref.": "customer_asset_id", + # "Postcode": "postcode", + # "House no": "house_number", + # "Name": "address1", + # "Address line 2": "address2", + # "City/Town": "city_town", + # "County": "county", + # "Address ID": "external_address_id",