Added postcode filter back to os api

2026-06-08 11:17:27 +00:00 · 2024-06-08 07:51:17 +01:00 · 2024-06-08 07:51:17 +01:00 · 0c1ef69fba
commit 0c1ef69fba
parent 9e32b8bf74
2 changed files with 200 additions and 6 deletions
--- a/backend/OrdnanceSurvey.py
+++ b/backend/OrdnanceSurvey.py
@ -38,7 +38,11 @@ class OrdnanceSuveyClient:
            raise ValueError("No results found - run get_places_api first")
        self.address_os = self.most_relevant_result["ADDRESS"]
-        self.postcode_os = self.most_relevant_result["POSTCODE"]
+
        if "POSTCODE" in self.most_relevant_result:
            self.postcode_os = self.most_relevant_result["POSTCODE"]
        else:
            self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"]
        # We strip out the postcode from the address as this is already stored separately
        self.address_os = self.address_os.replace(self.postcode_os, "").strip()
        # Remove trailing comma
@ -49,7 +53,7 @@ class OrdnanceSuveyClient:
        self.postcode_os = self.postcode_os.upper()
    @lru_cache(maxsize=128)
-    def get_places_api(self):
+    def get_places_api(self, filter_by_postcode=False):
        """
        This method is tasked with getting the places api from the Ordnance Survey.
        """
@ -58,16 +62,35 @@ class OrdnanceSuveyClient:
            raise ValueError("Ordnance Survey API key not specified")
        encoded_address_query = urllib.parse.quote(self.full_address)
-        url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
+
-               f"{self.api_key}")
+        url = (
            f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10"
            f"&key={self.api_key}"
        )
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
-            results = data['results']
+            res = data["results"]
            if filter_by_postcode:
                results = []
                for r in res:
                    if "DPA" in r:
                        if r["DPA"]["POSTCODE"] == self.postcode:
                            results.append(r)
                    elif "LPI" in r:
                        if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode:
                            results.append(r)
                    else:
                        raise ValueError("Could not find postcode in either DPA or LPI")
            else:
                results = res
            self.results = results
            # Extract some details about the best match
-            self.most_relevant_result = self.results[0]["DPA"]
+            self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"]
            self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
            self.set_places_address()
@ -99,6 +122,9 @@ class OrdnanceSuveyClient:
            'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
            'RD06': {'property_type': 'Flat'},
        }
        # Other classification codes can be found here:
        # https://osdatahub.os.uk/docs/places/technicalSpecification (see the CLASSIFICATION_CODE description).
        # A lookup table CSV containing all of the codes can be downloaded.
        mapped = value_map.get(classification_code, {})
        self.property_type = mapped.get("property_type", "")
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -3,6 +3,8 @@ from tqdm import tqdm
 import os
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
 import urllib.parse
 import requests
 from fuzzywuzzy import fuzz
 import numpy as np
@ -334,4 +336,170 @@ def app():
    # 4) Retrieving additional data against the internal_id
    # 5) Creation of final dataset for clustering
    os_most_relevant = []
    os_all = {}
    for i in ["1", "2", "3"]:
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all = {**os_all, **json.loads(os_all_segment)}
    os_most_relevant = pd.DataFrame(os_most_relevant)
    os_address_comparison = os_data_pull_asset_list[
        ["internal_id", "full_address", "postcode", "house_number", "address1"]
    ].merge(
        os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )
    # Check for records where the postcode doesn't match (case-insensitive comparison);
    # the house number comparison follows further down
    os_address_comparison["postcodes_match"] = (
        os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
    )
    # Extract the house number from the OS ADDRESS string
    os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )
    # Compare house number
    os_address_comparison["house_numbers_match"] = (
        os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
    )
    # String similarity
    os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
    problematic = os_address_comparison.copy()
    problematic = problematic[
        (problematic["address_similarity_score"] <= 80) |
        (~problematic["house_numbers_match"]) |
        (~problematic["postcodes_match"])
        ]
    # TODO: Flag these records as problematic in the final output
    # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
    ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    problematic_os = []
    problematic_os_all = {}
    problematic_errors = []
    for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
        # Let's just do a backup pull - we're now using LPI too
        time.sleep(2)
        backup_searher = SearchEpc(
            address1=row["address1"],
            postcode=row["postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=ORDNANCE_SURVEY_API_KEY,
            uprn=None,
        )
        # Attempt to get places data with retry logic
        result = get_places_with_retry(backup_searher)
        if result:
            # Get the most relevant response
            problematic_os.append(
                {
                    "internal_id": row["internal_id"],
                    **backup_searher.ordnance_survey_client.most_relevant_result
                }
            )
            # Also keep the best 100 results
            problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
        else:
            # Record the internal_id of the asset that failed
            print("Error for address: " + row["full_address"])
            problematic_errors.append(row["internal_id"])
    # Store to S3
    save_data_to_s3(
        data=json.dumps(problematic_os),
        s3_file_name="customers/Stonewater/clustering/problematic_os.json",
        bucket_name="retrofit-data-dev"
    )
    save_data_to_s3(
        data=json.dumps(problematic_os_all),
        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
        bucket_name="retrofit-data-dev"
    )
    save_data_to_s3(
        data=json.dumps(problematic_errors),
        s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
        bucket_name="retrofit-data-dev"
    )
    # Next steps: We should collate all of the data and produce 1 big dataset
    problematic_os_df = pd.DataFrame(problematic_os)
    problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
        problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )
    problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
    problematic_address_comparison["postcodes_match"] = (
        problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
        "OS_POSTCODE"].str.lower()
    )
    problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
                                                                                ascending=True)
    # Final check: keep the records whose similarity score is still <= 90 or whose postcode still doesn't match
    final_check = problematic_address_comparison[
        (problematic_address_comparison["match_similarity_score"] <= 90) |
        (~problematic_address_comparison["postcodes_match"])
        ]
    final_best_matches = []
    for _, row in final_check.iterrows():
        os_data = problematic_os_all[row["internal_id"]]
        os_data = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
        )
        os_data["postcode"] = np.where(
            ~pd.isnull(os_data["POSTCODE"]),
            os_data["POSTCODE"],
            os_data["POSTCODE_LOCATOR"]
        )
        os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
        if os_data.shape[0] == 1:
            final_best_matches.append(
                {
                    "internal_id": row["internal_id"],
                    **os_data.iloc[0].to_dict()
                }
            )
        else:
            blah