From 0c1ef69fba8a099386835960dbe3ab53351ef331 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 8 Jun 2024 07:51:17 +0100 Subject: [PATCH] Added postcode filter back to os api --- backend/OrdnanceSurvey.py | 38 +++- etl/customers/stonewater/shdf_3_clustering.py | 168 ++++++++++++++++++ 2 files changed, 200 insertions(+), 6 deletions(-) diff --git a/backend/OrdnanceSurvey.py b/backend/OrdnanceSurvey.py index 837e76bd..856dda7a 100644 --- a/backend/OrdnanceSurvey.py +++ b/backend/OrdnanceSurvey.py @@ -38,7 +38,11 @@ class OrdnanceSuveyClient: raise ValueError("No results found - run get_places_api first") self.address_os = self.most_relevant_result["ADDRESS"] - self.postcode_os = self.most_relevant_result["POSTCODE"] + + if "POSTCODE" in self.most_relevant_result: + self.postcode_os = self.most_relevant_result["POSTCODE"] + else: + self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"] # We strip out the postcode from the address as this is already stored separately self.address_os = self.address_os.replace(self.postcode_os, "").strip() # Remove trailing comma @@ -49,7 +53,7 @@ class OrdnanceSuveyClient: self.postcode_os = self.postcode_os.upper() @lru_cache(maxsize=128) - def get_places_api(self): + def get_places_api(self, filter_by_postcode=False): """ This method is tasked with getting the places api from the Ordnance Survey. """ @@ -58,16 +62,35 @@ class OrdnanceSuveyClient: raise ValueError("Ordnance Survey API key not specified") encoded_address_query = urllib.parse.quote(self.full_address) - url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key=" - f"{self.api_key}") + + url = ( + f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10" + f"&key={self.api_key}" + ) + response = requests.get(url) if response.status_code == 200: data = response.json() - results = data['results'] + res = data["results"] + + if filter_by_postcode: + results = [] + for r in res: + if "DPA" in r: + if r["DPA"]["POSTCODE"] == self.postcode: + results.append(r) + elif "LPI" in r: + if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode: + results.append(r) + else: + raise ValueError("Could not find postcode in either DPA or LPI") + else: + results = res + self.results = results # Extract some details about the best match - self.most_relevant_result = self.results[0]["DPA"] + self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"] self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"]) self.set_places_address() @@ -99,6 +122,9 @@ class OrdnanceSuveyClient: 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, 'RD06': {'property_type': 'Flat'}, } + # Other classifications can be found in here: + # https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description. + # A lookup table csv can be downloaded which contains all of the codes mapped = value_map.get(classification_code, {}) self.property_type = mapped.get("property_type", "") diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 45b435ed..8a3725b9 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -3,6 +3,8 @@ from tqdm import tqdm import os from dotenv import load_dotenv from backend.SearchEpc import SearchEpc +import urllib.parse +import requests from fuzzywuzzy import fuzz import numpy as np @@ -334,4 +336,170 @@ def app(): # 4) Retrieveing additional data against the internal_id # 5) Creation of final dataset for clustering + os_most_relevant = [] + os_all = {} for i in ["1", "2", "3"]: + most_relevant_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json" + ) + os_most_relevant.extend(json.loads(most_relevant_segment)) + os_all_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json" + ) + os_all = {**os_all, **json.loads(os_all_segment)} + + os_most_relevant = pd.DataFrame(os_most_relevant) + + os_address_comparison = os_data_pull_asset_list[ + ["internal_id", "full_address", "postcode", "house_number", "address1"] + ].merge( + os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]], + how="inner", + on="internal_id" + ) + + # Compare house number + # Check for records where the postcode doesn't match + os_address_comparison["postcodes_match"] = ( + os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower() + ) + + # extract it from ADDRESS + os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply( + lambda x: SearchEpc.get_house_number(x) + ) + + # Compare house number + os_address_comparison["house_numbers_match"] = ( + os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower() + ) + + # String similarity + os_address_comparison["address_similarity_score"] = os_address_comparison.apply( + lambda x: fuzz.ratio( + remove_commas_and_full_stops(x["full_address"].lower()), + remove_commas_and_full_stops(x["ADDRESS"].lower()) + ), + axis=1 + ) + + os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True) + + problematic = os_address_comparison.copy() + + problematic = problematic[ + (problematic["address_similarity_score"] <= 80) | + (~problematic["house_numbers_match"]) | + (~problematic["postcodes_match"]) + ] + + # TODO: We'll label these problematic records as problematic, in the final output + + # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True) + + ORDNANCE_SURVEY_API_KEY = "" # This API key is a temp key which I have copied locally + problematic_os = [] + problematic_os_all = {} + problematic_errors = [] + for _, row in tqdm(problematic.iterrows(), total=len(problematic)): + # Let's just do a backup pull - we're now using LPI too + time.sleep(2) + backup_searher = SearchEpc( + address1=row["address1"], + postcode=row["postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=ORDNANCE_SURVEY_API_KEY, + uprn=None, + ) + # Attempt to get places data with retry logic + result = get_places_with_retry(backup_searher) + + if result: + # Get the most relevant response + problematic_os.append( + { + "internal_id": row["internal_id"], + **backup_searher.ordnance_survey_client.most_relevant_result + } + ) + + # Also keep the best 100 results + problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results + else: + # Record the internal_id of the asset that failed + print("Error for address: " + row["full_address"]) + problematic_errors.append(row["internal_id"]) + + # Store to S3 + save_data_to_s3( + data=json.dumps(problematic_os), + s3_file_name="customers/Stonewater/clustering/problematic_os.json", + bucket_name="retrofit-data-dev" + ) + + save_data_to_s3( + data=json.dumps(problematic_os_all), + s3_file_name="customers/Stonewater/clustering/problematic_os_all.json", + bucket_name="retrofit-data-dev" + ) + + save_data_to_s3( + data=json.dumps(problematic_errors), + s3_file_name="customers/Stonewater/clustering/problematic_errors.json", + bucket_name="retrofit-data-dev" + ) + + # Next steps: We should collate all of the data and produce 1 big dataset + + problematic_os_df = pd.DataFrame(problematic_os) + problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge( + problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]], + how="inner", + on="internal_id" + ) + + problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1] + problematic_address_comparison["postcodes_match"] = ( + problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[ + "OS_POSTCODE"].str.lower() + ) + + problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply( + lambda x: fuzz.ratio( + remove_commas_and_full_stops(x["full_address"].lower()), + remove_commas_and_full_stops(x["ADDRESS"].lower()) + ), + axis=1 + ) + problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score", + ascending=True) + + # We perform a final check + final_check = problematic_address_comparison[ + (problematic_address_comparison["match_similarity_score"] <= 90) | + (~problematic_address_comparison["postcodes_match"]) + ] + + final_best_matches = [] + for _, row in final_check.iterrows(): + os_data = problematic_os_all[row["internal_id"]] + os_data = pd.DataFrame( + [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data] + ) + os_data["postcode"] = np.where( + ~pd.isnull(os_data["POSTCODE"]), + os_data["POSTCODE"], + os_data["POSTCODE_LOCATOR"] + ) + os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()] + if os_data.shape[0] == 1: + final_best_matches.append( + { + "internal_id": row["internal_id"], + **os_data.iloc[0].to_dict() + } + ) + else: + blah