mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added postcode filter back to os api
This commit is contained in:
parent
9e32b8bf74
commit
0c1ef69fba
2 changed files with 200 additions and 6 deletions
|
|
@ -38,7 +38,11 @@ class OrdnanceSuveyClient:
|
|||
raise ValueError("No results found - run get_places_api first")
|
||||
|
||||
self.address_os = self.most_relevant_result["ADDRESS"]
|
||||
self.postcode_os = self.most_relevant_result["POSTCODE"]
|
||||
|
||||
if "POSTCODE" in self.most_relevant_result:
|
||||
self.postcode_os = self.most_relevant_result["POSTCODE"]
|
||||
else:
|
||||
self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"]
|
||||
# We strip out the postcode from the address as this is already stored separately
|
||||
self.address_os = self.address_os.replace(self.postcode_os, "").strip()
|
||||
# Remove trailing comma
|
||||
|
|
@ -49,7 +53,7 @@ class OrdnanceSuveyClient:
|
|||
self.postcode_os = self.postcode_os.upper()
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def get_places_api(self):
|
||||
def get_places_api(self, filter_by_postcode=False):
|
||||
"""
|
||||
This method is tasked with getting the places api from the Ordnance Survey.
|
||||
"""
|
||||
|
|
@ -58,16 +62,35 @@ class OrdnanceSuveyClient:
|
|||
raise ValueError("Ordnance Survey API key not specified")
|
||||
|
||||
encoded_address_query = urllib.parse.quote(self.full_address)
|
||||
url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
|
||||
f"{self.api_key}")
|
||||
|
||||
url = (
|
||||
f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10"
|
||||
f"&key={self.api_key}"
|
||||
)
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
results = data['results']
|
||||
res = data["results"]
|
||||
|
||||
if filter_by_postcode:
|
||||
results = []
|
||||
for r in res:
|
||||
if "DPA" in r:
|
||||
if r["DPA"]["POSTCODE"] == self.postcode:
|
||||
results.append(r)
|
||||
elif "LPI" in r:
|
||||
if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode:
|
||||
results.append(r)
|
||||
else:
|
||||
raise ValueError("Could not find postcode in either DPA or LPI")
|
||||
else:
|
||||
results = res
|
||||
|
||||
self.results = results
|
||||
|
||||
# Extract some details about the best match
|
||||
self.most_relevant_result = self.results[0]["DPA"]
|
||||
self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"]
|
||||
|
||||
self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
|
||||
self.set_places_address()
|
||||
|
|
@ -99,6 +122,9 @@ class OrdnanceSuveyClient:
|
|||
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
|
||||
'RD06': {'property_type': 'Flat'},
|
||||
}
|
||||
# Other classifications can be found in here:
|
||||
# https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description.
|
||||
# A lookup table csv can be downloaded which contains all of the codes
|
||||
|
||||
mapped = value_map.get(classification_code, {})
|
||||
self.property_type = mapped.get("property_type", "")
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ from tqdm import tqdm
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
import urllib.parse
|
||||
import requests
|
||||
|
||||
from fuzzywuzzy import fuzz
|
||||
import numpy as np
|
||||
|
|
@ -334,4 +336,170 @@ def app():
|
|||
# 4) Retrieving additional data against the internal_id
|
||||
# 5) Creation of final dataset for clustering
|
||||
|
||||
os_most_relevant = []
|
||||
os_all = {}
|
||||
for i in ["1", "2", "3"]:
|
||||
most_relevant_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||
)
|
||||
os_most_relevant.extend(json.loads(most_relevant_segment))
|
||||
os_all_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||
)
|
||||
os_all = {**os_all, **json.loads(os_all_segment)}
|
||||
|
||||
os_most_relevant = pd.DataFrame(os_most_relevant)
|
||||
|
||||
os_address_comparison = os_data_pull_asset_list[
|
||||
["internal_id", "full_address", "postcode", "house_number", "address1"]
|
||||
].merge(
|
||||
os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
# Compare house number
|
||||
# Check for records where the postcode doesn't match
|
||||
os_address_comparison["postcodes_match"] = (
|
||||
os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
|
||||
)
|
||||
|
||||
# Extract the house number from the OS ADDRESS field
|
||||
os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
|
||||
lambda x: SearchEpc.get_house_number(x)
|
||||
)
|
||||
|
||||
# Compare house number
|
||||
os_address_comparison["house_numbers_match"] = (
|
||||
os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
|
||||
)
|
||||
|
||||
# String similarity
|
||||
os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
|
||||
lambda x: fuzz.ratio(
|
||||
remove_commas_and_full_stops(x["full_address"].lower()),
|
||||
remove_commas_and_full_stops(x["ADDRESS"].lower())
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
|
||||
os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
|
||||
|
||||
problematic = os_address_comparison.copy()
|
||||
|
||||
problematic = problematic[
|
||||
(problematic["address_similarity_score"] <= 80) |
|
||||
(~problematic["house_numbers_match"]) |
|
||||
(~problematic["postcodes_match"])
|
||||
]
|
||||
|
||||
# TODO: Flag these records as problematic in the final output
|
||||
|
||||
# different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
|
||||
|
||||
ORDNANCE_SURVEY_API_KEY = "" # This API key is a temp key which I have copied locally
|
||||
problematic_os = []
|
||||
problematic_os_all = {}
|
||||
problematic_errors = []
|
||||
for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
|
||||
# Let's just do a backup pull - we're now using LPI too
|
||||
time.sleep(2)
|
||||
backup_searher = SearchEpc(
|
||||
address1=row["address1"],
|
||||
postcode=row["postcode"],
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key=ORDNANCE_SURVEY_API_KEY,
|
||||
uprn=None,
|
||||
)
|
||||
# Attempt to get places data with retry logic
|
||||
result = get_places_with_retry(backup_searher)
|
||||
|
||||
if result:
|
||||
# Get the most relevant response
|
||||
problematic_os.append(
|
||||
{
|
||||
"internal_id": row["internal_id"],
|
||||
**backup_searher.ordnance_survey_client.most_relevant_result
|
||||
}
|
||||
)
|
||||
|
||||
# Also keep the best 100 results
|
||||
problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
|
||||
else:
|
||||
# Record the internal_id of the asset that failed
|
||||
print("Error for address: " + row["full_address"])
|
||||
problematic_errors.append(row["internal_id"])
|
||||
|
||||
# Store to S3
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_os),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_os_all),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_data_to_s3(
|
||||
data=json.dumps(problematic_errors),
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
# Next steps: We should collate all of the data and produce 1 big dataset
|
||||
|
||||
problematic_os_df = pd.DataFrame(problematic_os)
|
||||
problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
|
||||
problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
|
||||
problematic_address_comparison["postcodes_match"] = (
|
||||
problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
|
||||
"OS_POSTCODE"].str.lower()
|
||||
)
|
||||
|
||||
problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
|
||||
lambda x: fuzz.ratio(
|
||||
remove_commas_and_full_stops(x["full_address"].lower()),
|
||||
remove_commas_and_full_stops(x["ADDRESS"].lower())
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
|
||||
ascending=True)
|
||||
|
||||
# We perform a final check
|
||||
final_check = problematic_address_comparison[
|
||||
(problematic_address_comparison["match_similarity_score"] <= 90) |
|
||||
(~problematic_address_comparison["postcodes_match"])
|
||||
]
|
||||
|
||||
final_best_matches = []
|
||||
for _, row in final_check.iterrows():
|
||||
os_data = problematic_os_all[row["internal_id"]]
|
||||
os_data = pd.DataFrame(
|
||||
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
|
||||
)
|
||||
os_data["postcode"] = np.where(
|
||||
~pd.isnull(os_data["POSTCODE"]),
|
||||
os_data["POSTCODE"],
|
||||
os_data["POSTCODE_LOCATOR"]
|
||||
)
|
||||
os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
|
||||
if os_data.shape[0] == 1:
|
||||
final_best_matches.append(
|
||||
{
|
||||
"internal_id": row["internal_id"],
|
||||
**os_data.iloc[0].to_dict()
|
||||
}
|
||||
)
|
||||
else:
|
||||
blah
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue