From 5e84967ee02fa5aa740426350290ae300b5381df Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Jun 2024 00:26:22 +0100 Subject: [PATCH] merging asset list with uprns for stonewater --- etl/customers/stonewater/shdf_3_clustering.py | 685 +++++++++++++++++- 1 file changed, 660 insertions(+), 25 deletions(-) diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 44043206..6723b86e 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -5,6 +5,7 @@ from dotenv import load_dotenv from backend.SearchEpc import SearchEpc import urllib.parse import requests +from datetime import datetime from fuzzywuzzy import fuzz import numpy as np @@ -631,6 +632,23 @@ def app(): # "Address ID": "external_address_id", +def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat): + if udprn is None: + p_os_data_all = pd.DataFrame([z["DPA"] if "DPA" in z else z["LPI"] for z in p_os_data_all]) + if is_flat: + p_os_data_all = p_os_data_all[p_os_data_all["CLASSIFICATION_CODE"] == "RD06"] + return p_os_data_all.head(1) + + return p_os_data_all.head(1) + + final_os_data = p_os_data[p_os_data["UDPRN"] == udprn] + if final_os_data.empty: + p_os_data_all = pd.DataFrame([z["DPA"] if "DPA" in z else z["LPI"] for z in p_os_data_all]) + final_os_data = p_os_data_all[p_os_data_all["UDPRN"].astype(str) == udprn] + + return final_os_data + + def compile_data(): """ Various data sources have been produced to create the final data source for Stonewater. @@ -640,13 +658,53 @@ def compile_data(): ######################################################################## # Read in data ######################################################################## - asset_list = read_excel_from_s3( - file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", - bucket_name="retrofit-data-dev", - header_row=4 + # asset_list = read_excel_from_s3( + # file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", + # bucket_name="retrofit-data-dev", + # header_row=4 + # ) + # + # udprn_data = read_excel_from_s3( + # file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx", + # bucket_name="retrofit-data-dev", + # header_row=0 + # )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"}) + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4 ) + udprn_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0 + )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"}) + udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str) + udprn_data["Address ID"] = udprn_data["Address ID"].astype(str) + + asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID") + asset_list = asset_list.rename(columns={"UDPRN": "udprn"}) + + # Read in the lookups + uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json" + ))) + + uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json" + ))) + uprn_lookup_2 = uprn_lookup_2.rename( + columns={ + "epc_address": "standardised_address", + "epc_postcode": "standardised_postcode" + } + ) + + # concat + uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2]) + # TODO: Read in UPRNs or UDPRN + # UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2 epc_data = json.loads( read_from_s3( @@ -660,13 +718,13 @@ def compile_data(): epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)] # This we can use to produce additional variables such as number of old surveys - older_epc_data = json.loads( - read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/old_epc_data.json" - ) - ) - older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop} + # older_epc_data = json.loads( + # read_from_s3( + # bucket_name="retrofit-data-dev", + # s3_file_name="customers/Stonewater/clustering/old_epc_data.json" + # ) + # ) + # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop} # This is the first ordnance survey data pull os_most_relevant_1 = [] @@ -703,8 +761,6 @@ def compile_data(): # Prepare asset list ######################################################################## # TODO: Merge on UPRNs - # Drop the bottom 4 rows, which are completely missing - asset_list = asset_list.head(-4) # Keep just the columns we're interested in asset_list = asset_list[ @@ -718,6 +774,7 @@ def compile_data(): "City/Town", "County", "Address ID", # This is not uprn + "udprn" ] ].rename( columns={ @@ -752,8 +809,17 @@ def compile_data(): if pd.isnull(asset_list["full_address"]).sum(): raise ValueError("Missing full addresses") + # Merge on UDPRN + + asset_list = asset_list.merge( + uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"] + ) + + # This is everything without a uprn + # Quick check to see if we have os data for every property that doesn't have an EPC without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)] + os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist() os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist() @@ -773,14 +839,124 @@ def compile_data(): if len(missing_os_data): raise Exception("We don't have SOME data for each internal_id") - # For the EPC data, some of them are missing UPRN - epc_data_to_address = asset_list[ - asset_list["internal_id"].isin(epc_data["internal_id"].values) - ][ - ["full_address", "internal_id"]].merge( - epc_data, how="left", on="internal_id" + # Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address + address_uprn_udprn_lookup = [] + for _, x in without_epc.iterrows(): + if pd.isnull(x["UDPRN"]): + continue + udprn = str(int(x["UDPRN"])) + internal_id = x["internal_id"] + + is_flat = "flat" in x["address1"].lower() + + # Get the OS data + final_os_data = pd.DataFrame() + if internal_id in os_most_relevant_1_internal_ids: + p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] + p_os_data_all = os_all_1[str(internal_id)] + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) + + if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty: + p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] + p_os_data_all = os_all_2[str(internal_id)] + + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) + + if final_os_data.empty: + continue + + if final_os_data.shape[0] != 1: + if final_os_data["UPRN"].nunique() > 1: + raise Exception("Investigate me") + + address_uprn_udprn_lookup.append( + { + "internal_id": internal_id, + "external_address_id": x["external_address_id"], + "udprn": udprn, + "uprn": final_os_data["UPRN"].values[0], + "standardised_address": final_os_data["ADDRESS"].values[0], + "standardised_postcode": final_os_data["POSTCODE"].values[0] + } + ) + + # Store this lookup + # save_data_to_s3( + # data=json.dumps(address_uprn_udprn_lookup), + # s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json", + # bucket_name="retrofit-data-dev" + # ) + + address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup) + missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)] + + address_comparison = ( + asset_list[ + ["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"] + ].merge( + epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename( + columns={ + "address": "epc_address", + "postcode": "epc_postcode", + "address1": "epc_address1" + } + ), + how="inner", + on="internal_id" + ) ) - missed_uprn = epc_data_to_address[epc_data_to_address["uprn"] == ""] + + address_comparison["address_similarity_score"] = address_comparison.apply( + lambda x: fuzz.ratio( + remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()), + remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower()) + ), + axis=1 + ) + address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False) + # Cond + confident = address_comparison[address_comparison["address_similarity_score"] >= 95] + low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy() + + lookup_2 = confident[ + [ + 'internal_id', 'external_address_id', 'UDPRN', 'uprn', + 'epc_address', 'epc_postcode'] + ].rename(columns={"UDPRN": "udprn"}) + + # Store in S3 + # save_data_to_s3( + # data=json.dumps(lookup_2.to_dict("records")), + # s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json", + # bucket_name="retrofit-data-dev" + # ) + + # Need to deal with the low confidence records + low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])] + for _, x in low_confidence_asset_list.iterrows(): + udprn = str(int(x["UDPRN"])) + internal_id = x["internal_id"] + # Get the OS data + final_os_data = pd.DataFrame() + if internal_id in os_most_relevant_1_internal_ids: + p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] + p_os_data_all = os_all_1[str(internal_id)] + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn) + + if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty: + p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] + p_os_data_all = os_all_2[str(internal_id)] + + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn) + + # For the EPC data, some of them are missing UPRN + epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id") + epc_data["uprn"] = np.where( + epc_data["uprn"] == "", + epc_data["mapped_uprn"], + epc_data["uprn"] + ) + epc_data = epc_data.drop(columns=["mapped_uprn"]) # Once we have UPRNs, we might want to pull in the EPC data again # epc_data_with_uprn = [] @@ -864,8 +1040,7 @@ def compile_data(): ] ) - # We now pull additional data - uprns = complete_epcs["uprn"].tolist() + # We now pull spatial data # We get the spatial file list and loop through each EPC and determine which file it needs. # We then just read in the files that we need and get the data, for each uprn from that file @@ -875,7 +1050,7 @@ def compile_data(): uprn_lookup = {} for uprn in complete_epcs["uprn"]: - if not uprn: + if pd.isnull(uprn): # TODO: Do something about this! continue filtered_df = uprn_filenames[ @@ -914,13 +1089,473 @@ def compile_data(): property_attributes = complete_epcs.merge( spatial_data_to_uprn, - how="left", + how="inner", on="uprn" ) # We drop the columns we don't care about for clustering property_attributes = property_attributes.drop( columns=[ - + "address", + "uprn-source", + "heating-cost-potential", + "hot-water-cost-potential", + "potential-energy-rating", + "environment-impact-potential", + "address3", + "local-authority-label", + "sheating-energy-eff", + "local-authority-label", + "county", + "postcode", + "constituency", + "co2-emissions-potential", + "energy-consumption-potential", + "local-authority", + "inspection-date", + "address1", + "constituency-label", + "building-reference-number", + "floor-energy-eff", + "address2", + "posttown", + "floor-env-eff", + "sheating-env-eff", + "lighting-cost-potential", + "main-heating-controls", + "transaction-type", + "uprn", + "lodgement-date", + "lmk-key", + "wind-turbine-count", + "tenure", + "potential-energy-efficiency", ] ) + + # Fields to transform: lodgement-datetime + property_attributes["days_since_last_epc"] = ( + datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"]) + ).dt.days + + property_attributes = property_attributes.drop(columns=["lodgement-datetime"]) + + # Up to: + # Round averages to nearest integer + fill_with_average = [ + "low-energy-fixed-light-count", + "floor-height", + "heating-cost-current", + "fixed-lighting-outlets-count", + "hot-water-cost-current", + "number-heated-rooms", + "co2-emiss-curr-per-floor-area", + "total-floor-area", + "environment-impact-current", + "co2-emissions-current", + "number-habitable-rooms", + "energy-consumption-current", + 'lighting-cost-current', + "low_energy_lighting", + ] + + fill_with_mode = [ + "multi-glaze-proportion", + "extension-count", + ] + + fill_with_zero = [ + "unheated-corridor-length", + "number-open-fireplaces", + "glazed-area", + "photo-supply", + ] + + fill_with_categorical = { + "construction-age-band": "unknown", + "mainheat-energy-eff": "N/A", + "windows-env-eff": "N/A", + "lighting-energy-eff": "N/A", + "energy-tariff": 'NO DATA!', + "mechanical-ventilation": 'NO DATA!', + "solar-water-heating-flag": "N", + "mains-gas-flag": "N", + "heat-loss-corridor": "unknown", + "flat-storey-count": "Not a flat", + "roof-energy-eff": "N/A", + "hot-water-env-eff": "N/A", + "mainheatc-energy-eff": "N/A", + "main-fuel": 'NO DATA!', + "lighting-env-eff": "N/A", + "windows-energy-eff": "N/A", + "roof-env-eff": "N/A", + "walls-env-eff": "N/A", + "mainheat-env-eff": "N/A", + "flat-top-storey": "N", + "mainheatc-env-eff": "N", + "floor-level": "NODATA!", + "hot-water-energy-eff": "N/A", + } + + # Consolidation columns to single value + consolidation_columns = { + "glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"}, + "mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"}, + "solar-water-heating-flag": {"from": [''], "to": "N"}, + "mains-gas-flag": {"from": [''], "to": "N"}, + "heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"}, + "flat-top-storey": {"from": [''], "to": "N"}, + "floor-level": {"from": [""], "to": "NODATA!"} + } + + +def concatenate_row(row): + return ', '.join(row.dropna().replace('', None).dropna().astype(str)) + + +def compile_data_final(): + # Updated version: + + """ + Various data sources have been produced to create the final data source for Stonewater. + This function combines them + :return: + """ + ######################################################################## + # Read in data + ######################################################################## + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4 + ) + + udprn_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0 + )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"}) + udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str) + udprn_data["Address ID"] = udprn_data["Address ID"].astype(str) + + asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID") + asset_list = asset_list.rename(columns={"UDPRN": "udprn"}) + + # Read in the lookups + uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json" + ))) + uprn_lookup_1["match_type"] = "Exact" + + uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json" + ))) + uprn_lookup_2 = uprn_lookup_2.rename( + columns={ + "epc_address": "standardised_address", + "epc_postcode": "standardised_postcode" + } + ) + uprn_lookup_2["match_type"] = "EPC" + + uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json" + ))) + uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply( + concatenate_row, axis=1 + ) + uprn_lookup_3 = uprn_lookup_3[ + ["udprn", "uprn", "standardised_address", "postcode"] + ].rename(columns={"postcode": "standardised_postcode"}) + uprn_lookup_3["match_type"] = "Exact" + + uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False) + uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str) + uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str) + # prepare lookup 4 + uprn_lookup_4 = [] + for _, x in uprn_lookup_4_basis.iterrows(): + + property_type = None + built_form = None + if x["option"] == 1: + uprn = x["os_option_1_uprn"] + standardised_address = x["os_option_1_address"] + postcode = x["os_option_1_postcode"] + elif x["option"] == 2: + uprn = x["os_option_2_uprn"] + standardised_address = x["os_option_2_address"] + postcode = x["os_option_2_postcode"] + else: + uprn = x["manual_uprn"] + standardised_address = x["manual_address"] + postcode = x["manual_postcode"] + + uprn_lookup_4.append( + { + "internal_id": x["internal_id"], + "external_address_id": x["external_address_id"], + "uprn": uprn, + "standardised_address": standardised_address, + "standardised_postcode": postcode, + "property_type": property_type, + "built_form": built_form + } + ) + uprn_lookup_4 = pd.DataFrame(uprn_lookup_4) + uprn_lookup_4["match_type"] = "Fuzzy" + + # concat + uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2]) + + # We now merge all of the UPRNs onto the asset list + assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list) + + epc_data = json.loads( + read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/epc_data.json" + ) + ) + epc_data = pd.DataFrame(epc_data) + + # We drop come EPCS + epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)] + + # This we can use to produce additional variables such as number of old surveys + # older_epc_data = json.loads( + # read_from_s3( + # bucket_name="retrofit-data-dev", + # s3_file_name="customers/Stonewater/clustering/old_epc_data.json" + # ) + # ) + # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop} + + ######################################################################## + # Prepare asset list + ######################################################################## + + # Keep just the columns we're interested in + asset_list = asset_list[ + [ + "Osm. ID", + "Org. ref.", + "Postcode", + "House no", + "Name", + "Address line 2", + "City/Town", + "County", + "Address ID", # This is not uprn + "udprn" + ] + ].rename( + columns={ + "Osm. ID": "internal_id", + "Org. ref.": "customer_asset_id", + "Postcode": "postcode", + "House no": "house_number", + "Name": "address1", + "Address line 2": "address2", + "City/Town": "city_town", + "County": "county", + "Address ID": "external_address_id", + } + ) + + # Create full address + asset_list["full_address"] = np.where( + ~pd.isnull(asset_list["address2"]), + ( + asset_list["address1"] + ", " + + asset_list["address2"] + ", " + + asset_list["city_town"].str.title() + ", " + + # asset_list["county"] + ", " + + asset_list["postcode"] + ), + asset_list["address1"] + ", " + + asset_list["city_town"].str.title() + ", " + + # asset_list["county"] + ", " + + asset_list["postcode"] + ) + + if pd.isnull(asset_list["full_address"]).sum(): + raise ValueError("Missing full addresses") + + # Final preps of lookups + uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str) + uprn_lookup_3 = uprn_lookup_3.merge( + asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn" + ) + uprn_lookup = pd.concat([ + uprn_lookup, + uprn_lookup_3, + uprn_lookup_4 + ]) + uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str) + + asset_list = asset_list.merge( + uprn_lookup.drop(columns=["udprn"]), + how="inner", + on=["internal_id", "external_address_id"] + ) + + # This is everything without a uprn + missing_uprn = asset_list[pd.isnull(asset_list["uprn"])] + + missing_uprn_with_udprn = missing_uprn[ + missing_uprn["udprn"] != "" + ].reset_index(drop=True) + + missing_uprn_without_udprn = missing_uprn[ + missing_uprn["udprn"] == "" + ].reset_index(drop=True) + + missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]] + # Pull in the best ordnance survey data for each one and manually fix + manua_fix = [] + for _, x in missing_uprn_without_udprn.iterrows(): + internal_id = x["internal_id"] + + os_option_1_address = "" + os_option_1_postcode = "" + os_option_1_uprn = "" + if internal_id in os_most_relevant_1_internal_ids: + p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] + os_option_1_address = p_os_data["ADDRESS"].values[0] + os_option_1_postcode = p_os_data["POSTCODE"].values[0] + os_option_1_uprn = p_os_data["UPRN"].values[0] + + os_option_2_address = "" + os_option_2_postcode = "" + os_option_2_uprn = "" + if internal_id in os_most_relevant_2_internal_ids: + p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] + os_option_2_address = p_os_data["ADDRESS"].values[0] + os_option_2_postcode = p_os_data["POSTCODE"].values[0] + os_option_2_uprn = p_os_data["UPRN"].values[0] + + manua_fix.append( + { + **x.to_dict(), + "os_option_1_address": os_option_1_address, + "os_option_1_postcode": os_option_1_postcode, + "os_option_1_uprn": os_option_1_uprn, + + "os_option_2_address": os_option_2_address, + "os_option_2_postcode": os_option_2_postcode, + "os_option_2_uprn": os_option_2_uprn, + } + ) + + manua_fix = pd.DataFrame(manua_fix) + # manua_fix.to_csv("manual_fix_uprns.csv") + + # Split into chunks of 200 + api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp" + import requests + import time + completed_id = 0 + + uprn_to_udprn = [] + for row_index, data in tqdm(missing_uprn_with_udprn.iterrows(), total=len(missing_uprn_with_udprn)): + if row_index < completed_id: + continue + time.sleep(0.5) + + # Call the API + udprn = data["udprn"] + + url = f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}?api_key={api_key}" + + payload = { + "api_key": api_key + } + headers = { + 'Accept': 'application/json' + } + + response = requests.request("GET", url, headers=headers, data=payload) + if response.status_code != 200: + raise ValueError("API call dead") + + result = response.json() + uprn_to_udprn.append( + result["result"] + ) + completed_id += 1 + + # Store in S3 + # save_data_to_s3( + # data=json.dumps(uprn_to_udprn), + # s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json", + # bucket_name="retrofit-data-dev" + # ) + + test = read_from_s3( + s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json", + bucket_name="retrofit-data-dev" + ) + test = pd.DataFrame(json.loads(test)) + + for _, x in missing_uprn.iterrows(): + udprn = x["udprn"] + udprn = None if udprn == "" else udprn + internal_id = x["internal_id"] + + is_flat = "flat" in x["address1"].lower() + # Get the OS data + final_os_data = pd.DataFrame() + if internal_id in os_most_relevant_1_internal_ids: + p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] + p_os_data_all = os_all_1[str(internal_id)] + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) + + if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty: + p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] + p_os_data_all = os_all_2[str(internal_id)] + final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat) + + # Try signing up on a free trial with these guys! + # https://ideal-postcodes.co.uk/pricing + # API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn + + if final_os_data.empty: + boo + continue + + if final_os_data.shape[0] != 1: + if final_os_data["UPRN"].nunique() > 1: + raise Exception("Investigate me") + + # TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there + # This is the first ordnance survey data pull + os_most_relevant_1 = [] + os_all_1 = {} + for i in tqdm(["1", "2", "3"]): + most_relevant_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json" + ) + os_most_relevant_1.extend(json.loads(most_relevant_segment)) + os_all_segment = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json" + ) + os_all_1 = {**os_all_1, **json.loads(os_all_segment)} + + os_most_relevant_1 = pd.DataFrame(os_most_relevant_1) + + # This is the second ordnance survey data pull + os_most_relevant_2 = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/problematic_os.json" + ) + os_most_relevant_2 = json.loads(os_most_relevant_2) + os_most_relevant_2 = pd.DataFrame(os_most_relevant_2) + + os_all_2 = read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/problematic_os_all.json" + ) + os_all_2 = json.loads(os_all_2)