corrected spelling of built forms

2026-06-08 11:17:27 +00:00 · 2024-06-13 01:13:19 +01:00 · 2024-06-13 01:13:19 +01:00 · 6f9a78cabc
commit 6f9a78cabc
parent 5e84967ee0
2 changed files with 106 additions and 121 deletions
--- a/backend/OrdnanceSurvey.py
+++ b/backend/OrdnanceSurvey.py
@ -117,8 +117,8 @@ class OrdnanceSuveyClient:
        value_map = {
            # In the OS API, "RD" is a "Dwelling"; however, this is not a valid property type in the EPC database
            'RD': {},
-            'RD02': {'property_type': 'House', 'built_form': 'Detatched'},
+            'RD02': {'property_type': 'House', 'built_form': 'Detached'},
-            'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'},
+            'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
            'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
            'RD06': {'property_type': 'Flat'},
        }
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -1285,7 +1285,7 @@ def compile_data_final():
        elif x["option"] == 2:
            uprn = x["os_option_2_uprn"]
            standardised_address = x["os_option_2_address"]
-            postcode = x["os_option_2_postcode"]
+            postcode = x["os_option_2_address"].split(", ")[-1]
        else:
            uprn = x["manual_uprn"]
            standardised_address = x["manual_address"]
@ -1347,7 +1347,8 @@ def compile_data_final():
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
-            "udprn"
+            "udprn",
            "Owning body"
        ]
    ].rename(
        columns={
@ -1360,6 +1361,7 @@ def compile_data_final():
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
            "Owning body": "owner"
        }
    )
@ -1400,59 +1402,117 @@ def compile_data_final():
        on=["internal_id", "external_address_id"]
    )
-    # This is everything without a uprn
+    # Store locally
-    missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
+    # asset_list.to_excel("Stonewater asset list with uprn.xlsx")
-    missing_uprn_with_udprn = missing_uprn[
+    # We take just domestic properties
        missing_uprn["udprn"] != "<NA>"
        ].reset_index(drop=True)
-    missing_uprn_without_udprn = missing_uprn[
+    # This is the first ordnance survey data pull
-        missing_uprn["udprn"] == "<NA>"
+    os_most_relevant_1 = []
-        ].reset_index(drop=True)
+    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
-    missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
+    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
    # Pull in the best ordnance survey data for each one and manually fix
    manua_fix = []
    for _, x in missing_uprn_without_udprn.iterrows():
        internal_id = x["internal_id"]
-        os_option_1_address = ""
+    # This is the second ordnance survey data pull
-        os_option_1_postcode = ""
+    os_most_relevant_2 = read_from_s3(
-        os_option_1_uprn = ""
+        bucket_name="retrofit-data-dev",
-        if internal_id in os_most_relevant_1_internal_ids:
+        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
-            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
+    )
-            os_option_1_address = p_os_data["ADDRESS"].values[0]
+    os_most_relevant_2 = json.loads(os_most_relevant_2)
-            os_option_1_postcode = p_os_data["POSTCODE"].values[0]
+    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
            os_option_1_uprn = p_os_data["UPRN"].values[0]
-        os_option_2_address = ""
+    os_all_2 = read_from_s3(
-        os_option_2_postcode = ""
+        bucket_name="retrofit-data-dev",
-        os_option_2_uprn = ""
+        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
-        if internal_id in os_most_relevant_2_internal_ids:
+    )
-            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
+    os_all_2 = json.loads(os_all_2)
            os_option_2_address = p_os_data["ADDRESS"].values[0]
            os_option_2_postcode = p_os_data["POSTCODE"].values[0]
            os_option_2_uprn = p_os_data["UPRN"].values[0]
-        manua_fix.append(
+    needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
    os_1_ids = os_most_relevant_1["internal_id"].values
    os_2_ids = os_most_relevant_2["internal_id"].values
    epc_data_batch_2 = []
    older_epcs_batch_2 = {}
    for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
        if pd.isnull(property["uprn"]):
            continue
        searcher = SearchEpc(
            address1=", ".join(property["standardised_address"].split(", ")[:-1]),
            postcode=property["standardised_postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            full_address=property["standardised_address"],
            uprn=property["uprn"]
        )
        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None and property["match_type"] == "Exact":
            # Estimate!
            # Get the OS data
            p_os_df = pd.DataFrame()
            if property["internal_id"] in os_1_ids:
                p_os_df = pd.DataFrame(
                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
                )
                p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
            if p_os_df.empty:
                p_os_df = pd.DataFrame(
                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
                )
                p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
            searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
            # Now we estimate
            searcher.newest_epc = searcher.estimate_epc(
                property_type=searcher.ordnance_survey_client.property_type,
                built_form=searcher.ordnance_survey_client.built_form,
                lmks_to_drop=None,
                exclude_old=True
            )
        elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
            if "flat" in property["standardised_address"].lower():
                searcher.newest_epc = searcher.estimate_epc(
                    property_type="Flat",
                    built_form=None,
                    lmks_to_drop=None,
                    exclude_old=True
                )
            else:
                searcher.newest_epc = searcher.estimate_epc(
                    property_type="House",
                    built_form=None,
                    lmks_to_drop=None,
                    exclude_old=True
                )
        epc_data_batch_2.append(
            {
-                **x.to_dict(),
+                "internal_id": property["internal_id"],
-                "os_option_1_address": os_option_1_address,
+                **searcher.newest_epc
                "os_option_1_postcode": os_option_1_postcode,
                "os_option_1_uprn": os_option_1_uprn,
                "os_option_2_address": os_option_2_address,
                "os_option_2_postcode": os_option_2_postcode,
                "os_option_2_uprn": os_option_2_uprn,
            }
        )
-    manua_fix = pd.DataFrame(manua_fix)
+        if searcher.older_epcs is not None:
-    # manua_fix.to_csv("manual_fix_uprns.csv")
+            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
-    # Split into chunks of 200
+
-    api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
+def pull_ideal_postcodes(missing_uprn_with_udprn):
    api_key = ""  # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
    import requests
    import time
    completed_id = 0
@ -1484,78 +1544,3 @@ def compile_data_final():
            result["result"]
        )
        completed_id += 1
    # Store in S3
    # save_data_to_s3(
    #     data=json.dumps(uprn_to_udprn),
    #     s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
    #     bucket_name="retrofit-data-dev"
    # )
    test = read_from_s3(
        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
        bucket_name="retrofit-data-dev"
    )
    test = pd.DataFrame(json.loads(test))
    for _, x in missing_uprn.iterrows():
        udprn = x["udprn"]
        udprn = None if udprn == "<NA>" else udprn
        internal_id = x["internal_id"]
        is_flat = "flat" in x["address1"].lower()
        # Get the OS data
        final_os_data = pd.DataFrame()
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
            p_os_data_all = os_all_1[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
            p_os_data_all = os_all_2[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
        # Try signing up on a free trial with these guys!
        # https://ideal-postcodes.co.uk/pricing
        # API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
        if final_os_data.empty:
            boo
            continue
        if final_os_data.shape[0] != 1:
            if final_os_data["UPRN"].nunique() > 1:
                raise Exception("Investigate me")
    # TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
    # This is the first ordnance survey data pull
    os_most_relevant_1 = []
    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
    # This is the second ordnance survey data pull
    os_most_relevant_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
    )
    os_most_relevant_2 = json.loads(os_most_relevant_2)
    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
    os_all_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
    )
    os_all_2 = json.loads(os_all_2)