mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added postcode filter back to os api
This commit is contained in:
parent
9e32b8bf74
commit
0c1ef69fba
2 changed files with 200 additions and 6 deletions
|
|
@ -38,7 +38,11 @@ class OrdnanceSuveyClient:
|
||||||
raise ValueError("No results found - run get_places_api first")
|
raise ValueError("No results found - run get_places_api first")
|
||||||
|
|
||||||
self.address_os = self.most_relevant_result["ADDRESS"]
|
self.address_os = self.most_relevant_result["ADDRESS"]
|
||||||
self.postcode_os = self.most_relevant_result["POSTCODE"]
|
|
||||||
|
if "POSTCODE" in self.most_relevant_result:
|
||||||
|
self.postcode_os = self.most_relevant_result["POSTCODE"]
|
||||||
|
else:
|
||||||
|
self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"]
|
||||||
# We strip out the postcode from the address as this is already stored separately
|
# We strip out the postcode from the address as this is already stored separately
|
||||||
self.address_os = self.address_os.replace(self.postcode_os, "").strip()
|
self.address_os = self.address_os.replace(self.postcode_os, "").strip()
|
||||||
# Remove trailing comma
|
# Remove trailing comma
|
||||||
|
|
@ -49,7 +53,7 @@ class OrdnanceSuveyClient:
|
||||||
self.postcode_os = self.postcode_os.upper()
|
self.postcode_os = self.postcode_os.upper()
|
||||||
|
|
||||||
@lru_cache(maxsize=128)
|
@lru_cache(maxsize=128)
|
||||||
def get_places_api(self):
|
def get_places_api(self, filter_by_postcode=False):
|
||||||
"""
|
"""
|
||||||
This method is tasked with getting the places api from the Ordnance Survey.
|
This method is tasked with getting the places api from the Ordnance Survey.
|
||||||
"""
|
"""
|
||||||
|
|
@ -58,16 +62,35 @@ class OrdnanceSuveyClient:
|
||||||
raise ValueError("Ordnance Survey API key not specified")
|
raise ValueError("Ordnance Survey API key not specified")
|
||||||
|
|
||||||
encoded_address_query = urllib.parse.quote(self.full_address)
|
encoded_address_query = urllib.parse.quote(self.full_address)
|
||||||
url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
|
|
||||||
f"{self.api_key}")
|
url = (
|
||||||
|
f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10"
|
||||||
|
f"&key={self.api_key}"
|
||||||
|
)
|
||||||
|
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
data = response.json()
|
data = response.json()
|
||||||
results = data['results']
|
res = data["results"]
|
||||||
|
|
||||||
|
if filter_by_postcode:
|
||||||
|
results = []
|
||||||
|
for r in res:
|
||||||
|
if "DPA" in r:
|
||||||
|
if r["DPA"]["POSTCODE"] == self.postcode:
|
||||||
|
results.append(r)
|
||||||
|
elif "LPI" in r:
|
||||||
|
if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode:
|
||||||
|
results.append(r)
|
||||||
|
else:
|
||||||
|
raise ValueError("Could not find postcode in either DPA or LPI")
|
||||||
|
else:
|
||||||
|
results = res
|
||||||
|
|
||||||
self.results = results
|
self.results = results
|
||||||
|
|
||||||
# Extract some details about the best match
|
# Extract some details about the best match
|
||||||
self.most_relevant_result = self.results[0]["DPA"]
|
self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"]
|
||||||
|
|
||||||
self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
|
self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
|
||||||
self.set_places_address()
|
self.set_places_address()
|
||||||
|
|
@ -99,6 +122,9 @@ class OrdnanceSuveyClient:
|
||||||
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
|
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
|
||||||
'RD06': {'property_type': 'Flat'},
|
'RD06': {'property_type': 'Flat'},
|
||||||
}
|
}
|
||||||
|
# Other classifications can be found in here:
|
||||||
|
# https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description.
|
||||||
|
# A lookup table csv can be downloaded which contains all of the codes
|
||||||
|
|
||||||
mapped = value_map.get(classification_code, {})
|
mapped = value_map.get(classification_code, {})
|
||||||
self.property_type = mapped.get("property_type", "")
|
self.property_type = mapped.get("property_type", "")
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ from tqdm import tqdm
|
||||||
import os
|
import os
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from backend.SearchEpc import SearchEpc
|
from backend.SearchEpc import SearchEpc
|
||||||
|
import urllib.parse
|
||||||
|
import requests
|
||||||
|
|
||||||
from fuzzywuzzy import fuzz
|
from fuzzywuzzy import fuzz
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -334,4 +336,170 @@ def app():
|
||||||
# 4) Retrieveing additional data against the internal_id
|
# 4) Retrieveing additional data against the internal_id
|
||||||
# 5) Creation of final dataset for clustering
|
# 5) Creation of final dataset for clustering
|
||||||
|
|
||||||
|
os_most_relevant = []
|
||||||
|
os_all = {}
|
||||||
for i in ["1", "2", "3"]:
|
for i in ["1", "2", "3"]:
|
||||||
|
most_relevant_segment = read_from_s3(
|
||||||
|
bucket_name="retrofit-data-dev",
|
||||||
|
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||||
|
)
|
||||||
|
os_most_relevant.extend(json.loads(most_relevant_segment))
|
||||||
|
os_all_segment = read_from_s3(
|
||||||
|
bucket_name="retrofit-data-dev",
|
||||||
|
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||||
|
)
|
||||||
|
os_all = {**os_all, **json.loads(os_all_segment)}
|
||||||
|
|
||||||
|
os_most_relevant = pd.DataFrame(os_most_relevant)
|
||||||
|
|
||||||
|
os_address_comparison = os_data_pull_asset_list[
|
||||||
|
["internal_id", "full_address", "postcode", "house_number", "address1"]
|
||||||
|
].merge(
|
||||||
|
os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
|
||||||
|
how="inner",
|
||||||
|
on="internal_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compare house number
|
||||||
|
# Check for records where the postcode doesn't match
|
||||||
|
os_address_comparison["postcodes_match"] = (
|
||||||
|
os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
|
||||||
|
)
|
||||||
|
|
||||||
|
# extract it from ADDRESS
|
||||||
|
os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
|
||||||
|
lambda x: SearchEpc.get_house_number(x)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compare house number
|
||||||
|
os_address_comparison["house_numbers_match"] = (
|
||||||
|
os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
|
||||||
|
)
|
||||||
|
|
||||||
|
# String similarity
|
||||||
|
os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
|
||||||
|
lambda x: fuzz.ratio(
|
||||||
|
remove_commas_and_full_stops(x["full_address"].lower()),
|
||||||
|
remove_commas_and_full_stops(x["ADDRESS"].lower())
|
||||||
|
),
|
||||||
|
axis=1
|
||||||
|
)
|
||||||
|
|
||||||
|
os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
|
||||||
|
|
||||||
|
problematic = os_address_comparison.copy()
|
||||||
|
|
||||||
|
problematic = problematic[
|
||||||
|
(problematic["address_similarity_score"] <= 80) |
|
||||||
|
(~problematic["house_numbers_match"]) |
|
||||||
|
(~problematic["postcodes_match"])
|
||||||
|
]
|
||||||
|
|
||||||
|
# TODO: We'll label these problematic records as problematic, in the final output
|
||||||
|
|
||||||
|
# different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
|
||||||
|
|
||||||
|
ORDNANCE_SURVEY_API_KEY = "" # This API key is a temp key which I have copied locally
|
||||||
|
problematic_os = []
|
||||||
|
problematic_os_all = {}
|
||||||
|
problematic_errors = []
|
||||||
|
for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
|
||||||
|
# Let's just do a backup pull - we're now using LPI too
|
||||||
|
time.sleep(2)
|
||||||
|
backup_searher = SearchEpc(
|
||||||
|
address1=row["address1"],
|
||||||
|
postcode=row["postcode"],
|
||||||
|
auth_token=EPC_AUTH_TOKEN,
|
||||||
|
os_api_key=ORDNANCE_SURVEY_API_KEY,
|
||||||
|
uprn=None,
|
||||||
|
)
|
||||||
|
# Attempt to get places data with retry logic
|
||||||
|
result = get_places_with_retry(backup_searher)
|
||||||
|
|
||||||
|
if result:
|
||||||
|
# Get the most relevant response
|
||||||
|
problematic_os.append(
|
||||||
|
{
|
||||||
|
"internal_id": row["internal_id"],
|
||||||
|
**backup_searher.ordnance_survey_client.most_relevant_result
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Also keep the best 100 results
|
||||||
|
problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
|
||||||
|
else:
|
||||||
|
# Record the internal_id of the asset that failed
|
||||||
|
print("Error for address: " + row["full_address"])
|
||||||
|
problematic_errors.append(row["internal_id"])
|
||||||
|
|
||||||
|
# Store to S3
|
||||||
|
save_data_to_s3(
|
||||||
|
data=json.dumps(problematic_os),
|
||||||
|
s3_file_name="customers/Stonewater/clustering/problematic_os.json",
|
||||||
|
bucket_name="retrofit-data-dev"
|
||||||
|
)
|
||||||
|
|
||||||
|
save_data_to_s3(
|
||||||
|
data=json.dumps(problematic_os_all),
|
||||||
|
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
|
||||||
|
bucket_name="retrofit-data-dev"
|
||||||
|
)
|
||||||
|
|
||||||
|
save_data_to_s3(
|
||||||
|
data=json.dumps(problematic_errors),
|
||||||
|
s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
|
||||||
|
bucket_name="retrofit-data-dev"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Next steps: We should collate all of the data and produce 1 big dataset
|
||||||
|
|
||||||
|
problematic_os_df = pd.DataFrame(problematic_os)
|
||||||
|
problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
|
||||||
|
problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
|
||||||
|
how="inner",
|
||||||
|
on="internal_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
|
||||||
|
problematic_address_comparison["postcodes_match"] = (
|
||||||
|
problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
|
||||||
|
"OS_POSTCODE"].str.lower()
|
||||||
|
)
|
||||||
|
|
||||||
|
problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
|
||||||
|
lambda x: fuzz.ratio(
|
||||||
|
remove_commas_and_full_stops(x["full_address"].lower()),
|
||||||
|
remove_commas_and_full_stops(x["ADDRESS"].lower())
|
||||||
|
),
|
||||||
|
axis=1
|
||||||
|
)
|
||||||
|
problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
|
||||||
|
ascending=True)
|
||||||
|
|
||||||
|
# We perform a final check
|
||||||
|
final_check = problematic_address_comparison[
|
||||||
|
(problematic_address_comparison["match_similarity_score"] <= 90) |
|
||||||
|
(~problematic_address_comparison["postcodes_match"])
|
||||||
|
]
|
||||||
|
|
||||||
|
final_best_matches = []
|
||||||
|
for _, row in final_check.iterrows():
|
||||||
|
os_data = problematic_os_all[row["internal_id"]]
|
||||||
|
os_data = pd.DataFrame(
|
||||||
|
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
|
||||||
|
)
|
||||||
|
os_data["postcode"] = np.where(
|
||||||
|
~pd.isnull(os_data["POSTCODE"]),
|
||||||
|
os_data["POSTCODE"],
|
||||||
|
os_data["POSTCODE_LOCATOR"]
|
||||||
|
)
|
||||||
|
os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
|
||||||
|
if os_data.shape[0] == 1:
|
||||||
|
final_best_matches.append(
|
||||||
|
{
|
||||||
|
"internal_id": row["internal_id"],
|
||||||
|
**os_data.iloc[0].to_dict()
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
blah
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue