Added postcode filter back to os api

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-08 07:51:17 +01:00
parent 9e32b8bf74
commit 0c1ef69fba
2 changed files with 200 additions and 6 deletions

View file

@ -38,7 +38,11 @@ class OrdnanceSuveyClient:
raise ValueError("No results found - run get_places_api first") raise ValueError("No results found - run get_places_api first")
self.address_os = self.most_relevant_result["ADDRESS"] self.address_os = self.most_relevant_result["ADDRESS"]
self.postcode_os = self.most_relevant_result["POSTCODE"]
if "POSTCODE" in self.most_relevant_result:
self.postcode_os = self.most_relevant_result["POSTCODE"]
else:
self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"]
# We strip out the postcode from the address as this is already stored separately # We strip out the postcode from the address as this is already stored separately
self.address_os = self.address_os.replace(self.postcode_os, "").strip() self.address_os = self.address_os.replace(self.postcode_os, "").strip()
# Remove trailing comma # Remove trailing comma
@ -49,7 +53,7 @@ class OrdnanceSuveyClient:
self.postcode_os = self.postcode_os.upper() self.postcode_os = self.postcode_os.upper()
@lru_cache(maxsize=128) @lru_cache(maxsize=128)
def get_places_api(self): def get_places_api(self, filter_by_postcode=False):
""" """
This method is tasked with getting the places api from the Ordnance Survey. This method is tasked with getting the places api from the Ordnance Survey.
""" """
@ -58,16 +62,35 @@ class OrdnanceSuveyClient:
raise ValueError("Ordnance Survey API key not specified") raise ValueError("Ordnance Survey API key not specified")
encoded_address_query = urllib.parse.quote(self.full_address) encoded_address_query = urllib.parse.quote(self.full_address)
url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
f"{self.api_key}") url = (
f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10"
f"&key={self.api_key}"
)
response = requests.get(url) response = requests.get(url)
if response.status_code == 200: if response.status_code == 200:
data = response.json() data = response.json()
results = data['results'] res = data["results"]
if filter_by_postcode:
results = []
for r in res:
if "DPA" in r:
if r["DPA"]["POSTCODE"] == self.postcode:
results.append(r)
elif "LPI" in r:
if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode:
results.append(r)
else:
raise ValueError("Could not find postcode in either DPA or LPI")
else:
results = res
self.results = results self.results = results
# Extract some details about the best match # Extract some details about the best match
self.most_relevant_result = self.results[0]["DPA"] self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"]
self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"]) self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
self.set_places_address() self.set_places_address()
@ -99,6 +122,9 @@ class OrdnanceSuveyClient:
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
'RD06': {'property_type': 'Flat'}, 'RD06': {'property_type': 'Flat'},
} }
# Other classification codes are listed in the OS Places technical specification:
# https://osdatahub.os.uk/docs/places/technicalSpecification (see the CLASSIFICATION_CODE description).
# A lookup-table CSV containing all of the codes can be downloaded.
mapped = value_map.get(classification_code, {}) mapped = value_map.get(classification_code, {})
self.property_type = mapped.get("property_type", "") self.property_type = mapped.get("property_type", "")

View file

@ -3,6 +3,8 @@ from tqdm import tqdm
import os import os
from dotenv import load_dotenv from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc from backend.SearchEpc import SearchEpc
import urllib.parse
import requests
from fuzzywuzzy import fuzz from fuzzywuzzy import fuzz
import numpy as np import numpy as np
@ -334,4 +336,170 @@ def app():
# 4) Retrieving additional data against the internal_id # 4) Retrieving additional data against the internal_id
# 5) Creation of final dataset for clustering # 5) Creation of final dataset for clustering
os_most_relevant = []
os_all = {}
for i in ["1", "2", "3"]: for i in ["1", "2", "3"]:
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all = {**os_all, **json.loads(os_all_segment)}
os_most_relevant = pd.DataFrame(os_most_relevant)
os_address_comparison = os_data_pull_asset_list[
["internal_id", "full_address", "postcode", "house_number", "address1"]
].merge(
os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
how="inner",
on="internal_id"
)
# Compare postcodes
# Flag records where the input postcode doesn't match the postcode in the OS result
os_address_comparison["postcodes_match"] = (
os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
)
# Extract the house number from the OS ADDRESS field
os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
lambda x: SearchEpc.get_house_number(x)
)
# Compare house number
os_address_comparison["house_numbers_match"] = (
os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
)
# String similarity
os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
lambda x: fuzz.ratio(
remove_commas_and_full_stops(x["full_address"].lower()),
remove_commas_and_full_stops(x["ADDRESS"].lower())
),
axis=1
)
os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
problematic = os_address_comparison.copy()
problematic = problematic[
(problematic["address_similarity_score"] <= 80) |
(~problematic["house_numbers_match"]) |
(~problematic["postcodes_match"])
]
# TODO: Flag these records as problematic in the final output
# different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
ORDNANCE_SURVEY_API_KEY = "" # This API key is a temp key which I have copied locally
problematic_os = []
problematic_os_all = {}
problematic_errors = []
for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
# Do a backup pull for this record - the OS query now covers the LPI dataset as well as DPA
time.sleep(2)
backup_searher = SearchEpc(
address1=row["address1"],
postcode=row["postcode"],
auth_token=EPC_AUTH_TOKEN,
os_api_key=ORDNANCE_SURVEY_API_KEY,
uprn=None,
)
# Attempt to get places data with retry logic
result = get_places_with_retry(backup_searher)
if result:
# Get the most relevant response
problematic_os.append(
{
"internal_id": row["internal_id"],
**backup_searher.ordnance_survey_client.most_relevant_result
}
)
# Also keep the best 100 results
problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
else:
# Record the internal_id of the asset that failed
print("Error for address: " + row["full_address"])
problematic_errors.append(row["internal_id"])
# Store to S3
save_data_to_s3(
data=json.dumps(problematic_os),
s3_file_name="customers/Stonewater/clustering/problematic_os.json",
bucket_name="retrofit-data-dev"
)
save_data_to_s3(
data=json.dumps(problematic_os_all),
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
bucket_name="retrofit-data-dev"
)
save_data_to_s3(
data=json.dumps(problematic_errors),
s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
bucket_name="retrofit-data-dev"
)
# Next steps: We should collate all of the data and produce 1 big dataset
problematic_os_df = pd.DataFrame(problematic_os)
problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
how="inner",
on="internal_id"
)
problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
problematic_address_comparison["postcodes_match"] = (
problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
"OS_POSTCODE"].str.lower()
)
problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
lambda x: fuzz.ratio(
remove_commas_and_full_stops(x["full_address"].lower()),
remove_commas_and_full_stops(x["ADDRESS"].lower())
),
axis=1
)
problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
ascending=True)
# We perform a final check
final_check = problematic_address_comparison[
(problematic_address_comparison["match_similarity_score"] <= 90) |
(~problematic_address_comparison["postcodes_match"])
]
final_best_matches = []
for _, row in final_check.iterrows():
os_data = problematic_os_all[row["internal_id"]]
os_data = pd.DataFrame(
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
)
os_data["postcode"] = np.where(
~pd.isnull(os_data["POSTCODE"]),
os_data["POSTCODE"],
os_data["POSTCODE_LOCATOR"]
)
os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
if os_data.shape[0] == 1:
final_best_matches.append(
{
"internal_id": row["internal_id"],
**os_data.iloc[0].to_dict()
}
)
else:
blah