corrected spelling of built forms

2026-07-27 23:35:01 +00:00 · 2024-06-13 01:13:19 +01:00 · 2024-06-13 01:13:19 +01:00 · 6f9a78cabc
commit 6f9a78cabc
parent 5e84967ee0
2 changed files with 106 additions and 121 deletions
--- a/backend/OrdnanceSurvey.py
+++ b/backend/OrdnanceSurvey.py
@ -117,8 +117,8 @@ class OrdnanceSuveyClient:
        value_map = {
            # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database
            'RD': {},
-            'RD02': {'property_type': 'House', 'built_form': 'Detatched'},
-            'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'},
+            'RD02': {'property_type': 'House', 'built_form': 'Detached'},
+            'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
            'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
            'RD06': {'property_type': 'Flat'},
        }
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -1285,7 +1285,7 @@ def compile_data_final():
        elif x["option"] == 2:
            uprn = x["os_option_2_uprn"]
            standardised_address = x["os_option_2_address"]
-            postcode = x["os_option_2_postcode"]
+            postcode = x["os_option_2_address"].split(", ")[-1]
        else:
            uprn = x["manual_uprn"]
            standardised_address = x["manual_address"]
@ -1347,7 +1347,8 @@ def compile_data_final():
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
-            "udprn"
+            "udprn",
+            "Owning body"
        ]
    ].rename(
        columns={
@ -1360,6 +1361,7 @@ def compile_data_final():
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
+            "Owning body": "owner"
        }
    )

@ -1400,59 +1402,117 @@ def compile_data_final():
        on=["internal_id", "external_address_id"]
    )

-    # This is everything without a uprn
-    missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
+    # Store locally
+    # asset_list.to_excel("Stonewater asset list with uprn.xlsx")

-    missing_uprn_with_udprn = missing_uprn[
-        missing_uprn["udprn"] != "<NA>"
-        ].reset_index(drop=True)
+    # We take just domestic properties

-    missing_uprn_without_udprn = missing_uprn[
-        missing_uprn["udprn"] == "<NA>"
-        ].reset_index(drop=True)
+    # This is the first ordnance survey data pull
+    os_most_relevant_1 = []
+    os_all_1 = {}
+    for i in tqdm(["1", "2", "3"]):
+        most_relevant_segment = read_from_s3(
+            bucket_name="retrofit-data-dev",
+            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
+        )
+        os_most_relevant_1.extend(json.loads(most_relevant_segment))
+        os_all_segment = read_from_s3(
+            bucket_name="retrofit-data-dev",
+            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
+        )
+        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}

-    missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
-    # Pull in the best ordnance survey data for each one and manually fix
-    manua_fix = []
-    for _, x in missing_uprn_without_udprn.iterrows():
-        internal_id = x["internal_id"]
+    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)

-        os_option_1_address = ""
-        os_option_1_postcode = ""
-        os_option_1_uprn = ""
-        if internal_id in os_most_relevant_1_internal_ids:
-            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
-            os_option_1_address = p_os_data["ADDRESS"].values[0]
-            os_option_1_postcode = p_os_data["POSTCODE"].values[0]
-            os_option_1_uprn = p_os_data["UPRN"].values[0]
+    # This is the second ordnance survey data pull
+    os_most_relevant_2 = read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
+    )
+    os_most_relevant_2 = json.loads(os_most_relevant_2)
+    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)

-        os_option_2_address = ""
-        os_option_2_postcode = ""
-        os_option_2_uprn = ""
-        if internal_id in os_most_relevant_2_internal_ids:
-            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
-            os_option_2_address = p_os_data["ADDRESS"].values[0]
-            os_option_2_postcode = p_os_data["POSTCODE"].values[0]
-            os_option_2_uprn = p_os_data["UPRN"].values[0]
+    os_all_2 = read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
+    )
+    os_all_2 = json.loads(os_all_2)

-        manua_fix.append(
+    needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
+
+    os_1_ids = os_most_relevant_1["internal_id"].values
+    os_2_ids = os_most_relevant_2["internal_id"].values
+
+    epc_data_batch_2 = []
+    older_epcs_batch_2 = {}
+    for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
+        if pd.isnull(property["uprn"]):
+            continue
+        searcher = SearchEpc(
+            address1=", ".join(property["standardised_address"].split(", ")[:-1]),
+            postcode=property["standardised_postcode"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            full_address=property["standardised_address"],
+            uprn=property["uprn"]
+        )
+        searcher.find_property(skip_os=True)
+
+        if searcher.newest_epc is None and property["match_type"] == "Exact":
+            # Estimate!
+            # Get the OS data
+            p_os_df = pd.DataFrame()
+            if property["internal_id"] in os_1_ids:
+                p_os_df = pd.DataFrame(
+                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
+                )
+                p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
+
+            if p_os_df.empty:
+                p_os_df = pd.DataFrame(
+                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
+                )
+                p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
+
+            searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
+            # Now we estimate
+            searcher.newest_epc = searcher.estimate_epc(
+                property_type=searcher.ordnance_survey_client.property_type,
+                built_form=searcher.ordnance_survey_client.built_form,
+                lmks_to_drop=None,
+                exclude_old=True
+            )
+
+        elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
+
+            if "flat" in property["standardised_address"].lower():
+                searcher.newest_epc = searcher.estimate_epc(
+                    property_type="Flat",
+                    built_form=None,
+                    lmks_to_drop=None,
+                    exclude_old=True
+                )
+            else:
+                searcher.newest_epc = searcher.estimate_epc(
+                    property_type="House",
+                    built_form=None,
+                    lmks_to_drop=None,
+                    exclude_old=True
+                )
+
+        epc_data_batch_2.append(
            {
-                **x.to_dict(),
-                "os_option_1_address": os_option_1_address,
-                "os_option_1_postcode": os_option_1_postcode,
-                "os_option_1_uprn": os_option_1_uprn,
-
-                "os_option_2_address": os_option_2_address,
-                "os_option_2_postcode": os_option_2_postcode,
-                "os_option_2_uprn": os_option_2_uprn,
+                "internal_id": property["internal_id"],
+                **searcher.newest_epc
            }
        )

-    manua_fix = pd.DataFrame(manua_fix)
-    # manua_fix.to_csv("manual_fix_uprns.csv")
+        if searcher.older_epcs is not None:
+            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs

-    # Split into chunks of 200
-    api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
+
+def pull_ideal_postcodes(missing_uprn_with_udprn):
+    api_key = ""  # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
    import requests
    import time
    completed_id = 0
@ -1484,78 +1544,3 @@ def compile_data_final():
            result["result"]
        )
        completed_id += 1
-
-    # Store in S3
-    # save_data_to_s3(
-    #     data=json.dumps(uprn_to_udprn),
-    #     s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
-    #     bucket_name="retrofit-data-dev"
-    # )
-
-    test = read_from_s3(
-        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
-        bucket_name="retrofit-data-dev"
-    )
-    test = pd.DataFrame(json.loads(test))
-
-    for _, x in missing_uprn.iterrows():
-        udprn = x["udprn"]
-        udprn = None if udprn == "<NA>" else udprn
-        internal_id = x["internal_id"]
-
-        is_flat = "flat" in x["address1"].lower()
-        # Get the OS data
-        final_os_data = pd.DataFrame()
-        if internal_id in os_most_relevant_1_internal_ids:
-            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
-            p_os_data_all = os_all_1[str(internal_id)]
-            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
-
-        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
-            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
-            p_os_data_all = os_all_2[str(internal_id)]
-            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
-
-        # Try signing up on a free trial with these guys!
-        # https://ideal-postcodes.co.uk/pricing
-        # API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
-
-        if final_os_data.empty:
-            boo
-            continue
-
-        if final_os_data.shape[0] != 1:
-            if final_os_data["UPRN"].nunique() > 1:
-                raise Exception("Investigate me")
-
-    # TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
-    # This is the first ordnance survey data pull
-    os_most_relevant_1 = []
-    os_all_1 = {}
-    for i in tqdm(["1", "2", "3"]):
-        most_relevant_segment = read_from_s3(
-            bucket_name="retrofit-data-dev",
-            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
-        )
-        os_most_relevant_1.extend(json.loads(most_relevant_segment))
-        os_all_segment = read_from_s3(
-            bucket_name="retrofit-data-dev",
-            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
-        )
-        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
-
-    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
-
-    # This is the second ordnance survey data pull
-    os_most_relevant_2 = read_from_s3(
-        bucket_name="retrofit-data-dev",
-        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
-    )
-    os_most_relevant_2 = json.loads(os_most_relevant_2)
-    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
-
-    os_all_2 = read_from_s3(
-        bucket_name="retrofit-data-dev",
-        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
-    )
-    os_all_2 = json.loads(os_all_2)