From 0c1ef69fba8a099386835960dbe3ab53351ef331 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 8 Jun 2024 07:51:17 +0100
Subject: [PATCH] Added postcode filter back to os api

---
 backend/OrdnanceSurvey.py                     |  38 +++-
 etl/customers/stonewater/shdf_3_clustering.py | 168 ++++++++++++++++++
 2 files changed, 200 insertions(+), 6 deletions(-)

diff --git a/backend/OrdnanceSurvey.py b/backend/OrdnanceSurvey.py
index 837e76bd..856dda7a 100644
--- a/backend/OrdnanceSurvey.py
+++ b/backend/OrdnanceSurvey.py
@@ -38,7 +38,11 @@ class OrdnanceSuveyClient:
             raise ValueError("No results found - run get_places_api first")
 
         self.address_os = self.most_relevant_result["ADDRESS"]
-        self.postcode_os = self.most_relevant_result["POSTCODE"]
+
+        if "POSTCODE" in self.most_relevant_result:
+            self.postcode_os = self.most_relevant_result["POSTCODE"]
+        else:
+            self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"]
         # We strip out the postcode from the address as this is already stored separately
         self.address_os = self.address_os.replace(self.postcode_os, "").strip()
         # Remove trailing comma
@@ -49,7 +53,7 @@ class OrdnanceSuveyClient:
         self.postcode_os = self.postcode_os.upper()
 
     @lru_cache(maxsize=128)
-    def get_places_api(self):
+    def get_places_api(self, filter_by_postcode=False):
         """
         This method is tasked with getting the places api from the Ordnance Survey.
         """
@@ -58,16 +62,35 @@ class OrdnanceSuveyClient:
             raise ValueError("Ordnance Survey API key not specified")
 
         encoded_address_query = urllib.parse.quote(self.full_address)
-        url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
-               f"{self.api_key}")
+
+        url = (
+            f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10"
+            f"&key={self.api_key}"
+        )
+
         response = requests.get(url)
         if response.status_code == 200:
             data = response.json()
-            results = data['results']
+            res = data["results"]
+
+            if filter_by_postcode:
+                results = []
+                for r in res:
+                    if "DPA" in r:
+                        if r["DPA"]["POSTCODE"] == self.postcode:
+                            results.append(r)
+                    elif "LPI" in r:
+                        if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode:
+                            results.append(r)
+                    else:
+                        raise ValueError("Could not find postcode in either DPA or LPI")
+            else:
+                results = res
+
             self.results = results
 
             # Extract some details about the best match
-            self.most_relevant_result = self.results[0]["DPA"]
+            self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"]
 
             self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
             self.set_places_address()
@@ -99,6 +122,9 @@ class OrdnanceSuveyClient:
             'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
             'RD06': {'property_type': 'Flat'},
         }
+        # Other classification codes are listed in the OS Places technical specification
+        # (https://osdatahub.os.uk/docs/places/technicalSpecification) under the CLASSIFICATION_CODE description.
+        # A lookup table CSV containing all of the codes can also be downloaded.
 
         mapped = value_map.get(classification_code, {})
         self.property_type = mapped.get("property_type", "")
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index 45b435ed..8a3725b9 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -3,6 +3,8 @@ from tqdm import tqdm
 import os
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
+import urllib.parse
+import requests
 
 from fuzzywuzzy import fuzz
 import numpy as np
@@ -334,4 +336,170 @@ def app():
     # 4) Retrieveing additional data against the internal_id
     # 5) Creation of final dataset for clustering
 
+    os_most_relevant = []
+    os_all = {}
     for i in ["1", "2", "3"]:
+        most_relevant_segment = read_from_s3(
+            bucket_name="retrofit-data-dev",
+            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
+        )
+        os_most_relevant.extend(json.loads(most_relevant_segment))
+        os_all_segment = read_from_s3(
+            bucket_name="retrofit-data-dev",
+            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
+        )
+        os_all = {**os_all, **json.loads(os_all_segment)}
+
+    os_most_relevant = pd.DataFrame(os_most_relevant)
+
+    os_address_comparison = os_data_pull_asset_list[
+        ["internal_id", "full_address", "postcode", "house_number", "address1"]
+    ].merge(
+        os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
+        how="inner",
+        on="internal_id"
+    )
+
+    # Compare postcodes
+    # Record whether the asset-list postcode matches the OS POSTCODE (case-insensitive)
+    os_address_comparison["postcodes_match"] = (
+        os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
+    )
+
+    # Extract the house number from the OS ADDRESS string
+    os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
+        lambda x: SearchEpc.get_house_number(x)
+    )
+
+    # Compare house number
+    os_address_comparison["house_numbers_match"] = (
+        os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
+    )
+
+    # String similarity
+    os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
+        lambda x: fuzz.ratio(
+            remove_commas_and_full_stops(x["full_address"].lower()),
+            remove_commas_and_full_stops(x["ADDRESS"].lower())
+        ),
+        axis=1
+    )
+
+    os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
+
+    problematic = os_address_comparison.copy()
+
+    problematic = problematic[
+        (problematic["address_similarity_score"] <= 80) |
+        (~problematic["house_numbers_match"]) |
+        (~problematic["postcodes_match"])
+        ]
+
+    # TODO: Flag these records as problematic in the final output
+
+    # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
+
+    ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
+    problematic_os = []
+    problematic_os_all = {}
+    problematic_errors = []
+    for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
+        # Re-run the OS lookup as a backup pull - LPI results are now used as well as DPA
+        time.sleep(2)
+        backup_searher = SearchEpc(
+            address1=row["address1"],
+            postcode=row["postcode"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key=ORDNANCE_SURVEY_API_KEY,
+            uprn=None,
+        )
+        # Attempt to get places data with retry logic
+        result = get_places_with_retry(backup_searher)
+
+        if result:
+            # Get the most relevant response
+            problematic_os.append(
+                {
+                    "internal_id": row["internal_id"],
+                    **backup_searher.ordnance_survey_client.most_relevant_result
+                }
+            )
+
+            # Also keep the full list of results returned for this record (presumably up to 100 - TODO confirm)
+            problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
+        else:
+            # Record the internal_id of the asset that failed
+            print("Error for address: " + row["full_address"])
+            problematic_errors.append(row["internal_id"])
+
+    # Store to S3
+    save_data_to_s3(
+        data=json.dumps(problematic_os),
+        s3_file_name="customers/Stonewater/clustering/problematic_os.json",
+        bucket_name="retrofit-data-dev"
+    )
+
+    save_data_to_s3(
+        data=json.dumps(problematic_os_all),
+        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
+        bucket_name="retrofit-data-dev"
+    )
+
+    save_data_to_s3(
+        data=json.dumps(problematic_errors),
+        s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
+        bucket_name="retrofit-data-dev"
+    )
+
+    # Next step: collate all of the data into one combined dataset
+
+    problematic_os_df = pd.DataFrame(problematic_os)
+    problematic_address_comparison = problematic[["internal_id", "full_address", "postcode"]].merge(
+        problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
+        how="inner",
+        on="internal_id"
+    )
+
+    problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
+    problematic_address_comparison["postcodes_match"] = (
+        problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
+        "OS_POSTCODE"].str.lower()
+    )
+
+    problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
+        lambda x: fuzz.ratio(
+            remove_commas_and_full_stops(x["full_address"].lower()),
+            remove_commas_and_full_stops(x["ADDRESS"].lower())
+        ),
+        axis=1
+    )
+    problematic_address_comparison = problematic_address_comparison.sort_values("match_similarity_score",
+                                                                                ascending=True)
+
+    # Final check: keep records that still score <= 90 on similarity or whose postcodes still don't match
+    final_check = problematic_address_comparison[
+        (problematic_address_comparison["match_similarity_score"] <= 90) |
+        (~problematic_address_comparison["postcodes_match"])
+        ]
+
+    final_best_matches = []
+    for _, row in final_check.iterrows():
+        os_data = problematic_os_all[row["internal_id"]]
+        os_data = pd.DataFrame(
+            [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
+        )
+        os_data["postcode"] = np.where(
+            ~pd.isnull(os_data["POSTCODE"]),
+            os_data["POSTCODE"],
+            os_data["POSTCODE_LOCATOR"]
+        )
+        os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
+        if os_data.shape[0] == 1:
+            final_best_matches.append(
+                {
+                    "internal_id": row["internal_id"],
+                    **os_data.iloc[0].to_dict()
+                }
+            )
+        else:
+            blah