diff --git a/backend/OrdnanceSurvey.py b/backend/OrdnanceSurvey.py index 856dda7a..a4d716d0 100644 --- a/backend/OrdnanceSurvey.py +++ b/backend/OrdnanceSurvey.py @@ -117,8 +117,8 @@ class OrdnanceSuveyClient: value_map = { # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database 'RD': {}, - 'RD02': {'property_type': 'House', 'built_form': 'Detatched'}, - 'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'}, + 'RD02': {'property_type': 'House', 'built_form': 'Detached'}, + 'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'}, 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, 'RD06': {'property_type': 'Flat'}, } diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 6723b86e..c7afa28d 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1285,7 +1285,7 @@ def compile_data_final(): elif x["option"] == 2: uprn = x["os_option_2_uprn"] standardised_address = x["os_option_2_address"] - postcode = x["os_option_2_postcode"] + postcode = x["os_option_2_address"].split(", ")[-1] else: uprn = x["manual_uprn"] standardised_address = x["manual_address"] @@ -1347,7 +1347,8 @@ def compile_data_final(): "City/Town", "County", "Address ID", # This is not uprn - "udprn" + "udprn", + "Owning body" ] ].rename( columns={ @@ -1360,6 +1361,7 @@ def compile_data_final(): "City/Town": "city_town", "County": "county", "Address ID": "external_address_id", + "Owning body": "owner" } ) @@ -1400,59 +1402,117 @@ def compile_data_final(): on=["internal_id", "external_address_id"] ) - # This is everything without a uprn - missing_uprn = asset_list[pd.isnull(asset_list["uprn"])] + # Store locally + # asset_list.to_excel("Stonewater asset list with uprn.xlsx") - missing_uprn_with_udprn = missing_uprn[ - missing_uprn["udprn"] != "" - ].reset_index(drop=True) + # We take just domestic properties - missing_uprn_without_udprn = missing_uprn[ - missing_uprn["udprn"] == "" - ].reset_index(drop=True) + # This is the first ordnance survey data pull + os_most_relevant_1 = [] + os_all_1 = {} + for i in tqdm(["1", "2", "3"]): + most_relevant_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json" + ) + os_most_relevant_1.extend(json.loads(most_relevant_segment)) + os_all_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json" + ) + os_all_1 = {**os_all_1, **json.loads(os_all_segment)} - missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]] - # Pull in the best ordnance survey data for each one and manually fix - manua_fix = [] - for _, x in missing_uprn_without_udprn.iterrows(): - internal_id = x["internal_id"] + os_most_relevant_1 = pd.DataFrame(os_most_relevant_1) - os_option_1_address = "" - os_option_1_postcode = "" - os_option_1_uprn = "" - if internal_id in os_most_relevant_1_internal_ids: - p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] - os_option_1_address = p_os_data["ADDRESS"].values[0] - os_option_1_postcode = p_os_data["POSTCODE"].values[0] - os_option_1_uprn = p_os_data["UPRN"].values[0] + # This is the second ordnance survey data pull + os_most_relevant_2 = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/problematic_os.json" + ) + os_most_relevant_2 = json.loads(os_most_relevant_2) + os_most_relevant_2 = pd.DataFrame(os_most_relevant_2) - os_option_2_address = "" - os_option_2_postcode = "" - os_option_2_uprn = "" - if internal_id in os_most_relevant_2_internal_ids: - p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] - os_option_2_address = p_os_data["ADDRESS"].values[0] - os_option_2_postcode = p_os_data["POSTCODE"].values[0] - os_option_2_uprn = p_os_data["UPRN"].values[0] + os_all_2 = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/problematic_os_all.json" + ) + os_all_2 = json.loads(os_all_2) - manua_fix.append( + needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])] + + os_1_ids = os_most_relevant_1["internal_id"].values + os_2_ids = os_most_relevant_2["internal_id"].values + + epc_data_batch_2 = [] + older_epcs_batch_2 = {} + for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)): + if pd.isnull(property["uprn"]): + continue + searcher = SearchEpc( + address1=", ".join(property["standardised_address"].split(", ")[:-1]), + postcode=property["standardised_postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=property["standardised_address"], + uprn=property["uprn"] + ) + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None and property["match_type"] == "Exact": + # Estimate! + # Get the OS data + p_os_df = pd.DataFrame() + if property["internal_id"] in os_1_ids: + p_os_df = pd.DataFrame( + [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]] + ) + p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]] + + if p_os_df.empty: + p_os_df = pd.DataFrame( + [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]] + ) + p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]] + + searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0]) + # Now we estimate + searcher.newest_epc = searcher.estimate_epc( + property_type=searcher.ordnance_survey_client.property_type, + built_form=searcher.ordnance_survey_client.built_form, + lmks_to_drop=None, + exclude_old=True + ) + + elif searcher.newest_epc is None and property["match_type"] == "Fuzzy": + + if "flat" in property["standardised_address"].lower(): + searcher.newest_epc = searcher.estimate_epc( + property_type="Flat", + built_form=None, + lmks_to_drop=None, + exclude_old=True + ) + else: + searcher.newest_epc = searcher.estimate_epc( + property_type="House", + built_form=None, + lmks_to_drop=None, + exclude_old=True + ) + + epc_data_batch_2.append( { - **x.to_dict(), - "os_option_1_address": os_option_1_address, - "os_option_1_postcode": os_option_1_postcode, - "os_option_1_uprn": os_option_1_uprn, - - "os_option_2_address": os_option_2_address, - "os_option_2_postcode": os_option_2_postcode, - "os_option_2_uprn": os_option_2_uprn, + "internal_id": property["internal_id"], + **searcher.newest_epc } ) - manua_fix = pd.DataFrame(manua_fix) - # manua_fix.to_csv("manual_fix_uprns.csv") + if searcher.older_epcs is not None: + older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs - # Split into chunks of 200 - api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp" + +def pull_ideal_postcodes(missing_uprn_with_udprn): + api_key = "" # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/ import requests import time completed_id = 0 @@ -1484,78 +1544,3 @@ def compile_data_final(): result["result"] ) completed_id += 1 - - # Store in S3 - # save_data_to_s3( - # data=json.dumps(uprn_to_udprn), - # s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json", - # bucket_name="retrofit-data-dev" - # ) - - test = read_from_s3( - s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json", - bucket_name="retrofit-data-dev" - ) - test = pd.DataFrame(json.loads(test)) - - for _, x in missing_uprn.iterrows(): - udprn = x["udprn"] - udprn = None if udprn == "" else udprn - internal_id = x["internal_id"] - - is_flat = "flat" in x["address1"].lower() - # Get the OS data - final_os_data = pd.DataFrame() - if internal_id in os_most_relevant_1_internal_ids: - p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] - p_os_data_all = os_all_1[str(internal_id)] - final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) - - if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty: - p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] - p_os_data_all = os_all_2[str(internal_id)] - final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) - - # Try signing up on a free trial with these guys! - # https://ideal-postcodes.co.uk/pricing - # API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn - - if final_os_data.empty: - boo - continue - - if final_os_data.shape[0] != 1: - if final_os_data["UPRN"].nunique() > 1: - raise Exception("Investigate me") - - # TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there - # This is the first ordnance survey data pull - os_most_relevant_1 = [] - os_all_1 = {} - for i in tqdm(["1", "2", "3"]): - most_relevant_segment = read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json" - ) - os_most_relevant_1.extend(json.loads(most_relevant_segment)) - os_all_segment = read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json" - ) - os_all_1 = {**os_all_1, **json.loads(os_all_segment)} - - os_most_relevant_1 = pd.DataFrame(os_most_relevant_1) - - # This is the second ordnance survey data pull - os_most_relevant_2 = read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/problematic_os.json" - ) - os_most_relevant_2 = json.loads(os_most_relevant_2) - os_most_relevant_2 = pd.DataFrame(os_most_relevant_2) - - os_all_2 = read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/problematic_os_all.json" - ) - os_all_2 = json.loads(os_all_2)