merging asset list with uprns for stonewater

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-13 00:26:22 +01:00
parent 667ed1b990
commit 5e84967ee0

View file

@ -5,6 +5,7 @@ from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
import urllib.parse
import requests
from datetime import datetime
from fuzzywuzzy import fuzz
import numpy as np
@ -631,6 +632,23 @@ def app():
# "Address ID": "external_address_id",
def _os_records_to_frame(os_records):
    """Flatten raw OS result records into a DataFrame, one row per record.

    Each record carries its payload under a "DPA" key or, when that key is
    absent, under an "LPI" key.
    """
    return pd.DataFrame(
        [record["DPA"] if "DPA" in record else record["LPI"] for record in os_records]
    )


def _rows_matching_udprn(frame, udprn):
    """Return the rows of ``frame`` whose UDPRN, compared as text, equals ``udprn``."""
    if "UDPRN" not in frame.columns:
        # Nothing to compare against (e.g. an empty frame): report "no match".
        return frame.head(0)
    return frame[frame["UDPRN"].astype(str) == udprn]


def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat=False):
    """Pick the OS row(s) that correspond to a single property.

    :param p_os_data: DataFrame of the "most relevant" OS rows for the property.
    :param p_os_data_all: iterable of raw OS records for the property; each is a
        dict holding its payload under "DPA" or "LPI".
    :param udprn: UDPRN to match, or None when the property has no UDPRN.
    :param is_flat: when True and ``udprn`` is None, only candidates with
        CLASSIFICATION_CODE "RD06" are considered. Defaults to False so that
        existing three-argument calls keep working.
    :return: DataFrame of matching rows. It is empty when nothing matches,
        which callers detect with ``.empty``.
    """
    if udprn is None:
        # Without a UDPRN there is nothing to match on, so fall back to the
        # first candidate (restricted to flats when requested).
        candidates = _os_records_to_frame(p_os_data_all)
        if is_flat:
            if "CLASSIFICATION_CODE" not in candidates.columns:
                return candidates.head(0)
            candidates = candidates[candidates["CLASSIFICATION_CODE"] == "RD06"]
        return candidates.head(1)

    # Compare as text in both lookups: the UDPRN column may be numeric or
    # string depending on where the frame came from.
    udprn = str(udprn)
    matches = _rows_matching_udprn(p_os_data, udprn)
    if matches.empty:
        # Not among the most relevant rows; search every record we hold.
        matches = _rows_matching_udprn(_os_records_to_frame(p_os_data_all), udprn)
    return matches
def compile_data():
"""
Various data sources have been produced to create the final data source for Stonewater.
@ -640,13 +658,53 @@ def compile_data():
########################################################################
# Read in data
########################################################################
asset_list = read_excel_from_s3(
file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
bucket_name="retrofit-data-dev",
header_row=4
# asset_list = read_excel_from_s3(
# file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
# bucket_name="retrofit-data-dev",
# header_row=4
# )
#
# udprn_data = read_excel_from_s3(
# file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx",
# bucket_name="retrofit-data-dev",
# header_row=0
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
)
udprn_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
# Read in the lookups
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
)))
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
)))
uprn_lookup_2 = uprn_lookup_2.rename(
columns={
"epc_address": "standardised_address",
"epc_postcode": "standardised_postcode"
}
)
# concat
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
# TODO: Read in UPRNs or UDPRN
# UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2
epc_data = json.loads(
read_from_s3(
@ -660,13 +718,13 @@ def compile_data():
epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]
# This we can use to produce additional variables such as number of old surveys
older_epc_data = json.loads(
read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
)
)
older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
# older_epc_data = json.loads(
# read_from_s3(
# bucket_name="retrofit-data-dev",
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
# )
# )
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
# This is the first ordnance survey data pull
os_most_relevant_1 = []
@ -703,8 +761,6 @@ def compile_data():
# Prepare asset list
########################################################################
# TODO: Merge on UPRNs
# Drop the bottom 4 rows, which are completely missing
asset_list = asset_list.head(-4)
# Keep just the columns we're interested in
asset_list = asset_list[
@ -718,6 +774,7 @@ def compile_data():
"City/Town",
"County",
"Address ID", # This is not uprn
"udprn"
]
].rename(
columns={
@ -752,8 +809,17 @@ def compile_data():
if pd.isnull(asset_list["full_address"]).sum():
raise ValueError("Missing full addresses")
# Merge the UPRN lookup onto the asset list, keyed on internal_id and external_address_id
asset_list = asset_list.merge(
uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"]
)
# This is everything without a uprn
# Quick check to see if we have os data for every property that doesn't have an EPC
without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
@ -773,14 +839,124 @@ def compile_data():
if len(missing_os_data):
raise Exception("We don't have SOME data for each internal_id")
# For the EPC data, some of them are missing UPRN
epc_data_to_address = asset_list[
asset_list["internal_id"].isin(epc_data["internal_id"].values)
][
["full_address", "internal_id"]].merge(
epc_data, how="left", on="internal_id"
# Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address
address_uprn_udprn_lookup = []
for _, x in without_epc.iterrows():
if pd.isnull(x["UDPRN"]):
continue
udprn = str(int(x["UDPRN"]))
internal_id = x["internal_id"]
is_flat = "flat" in x["address1"].lower()
# Get the OS data
final_os_data = pd.DataFrame()
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
p_os_data_all = os_all_1[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
p_os_data_all = os_all_2[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
if final_os_data.empty:
continue
if final_os_data.shape[0] != 1:
if final_os_data["UPRN"].nunique() > 1:
raise Exception("Investigate me")
address_uprn_udprn_lookup.append(
{
"internal_id": internal_id,
"external_address_id": x["external_address_id"],
"udprn": udprn,
"uprn": final_os_data["UPRN"].values[0],
"standardised_address": final_os_data["ADDRESS"].values[0],
"standardised_postcode": final_os_data["POSTCODE"].values[0]
}
)
# Store this lookup
# save_data_to_s3(
# data=json.dumps(address_uprn_udprn_lookup),
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json",
# bucket_name="retrofit-data-dev"
# )
address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup)
missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)]
address_comparison = (
asset_list[
["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"]
].merge(
epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename(
columns={
"address": "epc_address",
"postcode": "epc_postcode",
"address1": "epc_address1"
}
),
how="inner",
on="internal_id"
)
)
missed_uprn = epc_data_to_address[epc_data_to_address["uprn"] == ""]
address_comparison["address_similarity_score"] = address_comparison.apply(
lambda x: fuzz.ratio(
remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()),
remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower())
),
axis=1
)
address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False)
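# Split the matches by confidence: a similarity score >= 95 is confident, anything lower is low confidence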
confident = address_comparison[address_comparison["address_similarity_score"] >= 95]
low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy()
lookup_2 = confident[
[
'internal_id', 'external_address_id', 'UDPRN', 'uprn',
'epc_address', 'epc_postcode']
].rename(columns={"UDPRN": "udprn"})
# Store in S3
# save_data_to_s3(
# data=json.dumps(lookup_2.to_dict("records")),
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json",
# bucket_name="retrofit-data-dev"
# )
# Need to deal with the low confidence records
low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])]
for _, x in low_confidence_asset_list.iterrows():
udprn = str(int(x["UDPRN"]))
internal_id = x["internal_id"]
# Get the OS data
final_os_data = pd.DataFrame()
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
p_os_data_all = os_all_1[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
p_os_data_all = os_all_2[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
# For the EPC data, some of them are missing UPRN
epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id")
epc_data["uprn"] = np.where(
epc_data["uprn"] == "",
epc_data["mapped_uprn"],
epc_data["uprn"]
)
epc_data = epc_data.drop(columns=["mapped_uprn"])
# Once we have UPRNs, we might want to pull in the EPC data again
# epc_data_with_uprn = []
@ -864,8 +1040,7 @@ def compile_data():
]
)
# We now pull additional data
uprns = complete_epcs["uprn"].tolist()
# We now pull spatial data
# We get the spatial file list and loop through each EPC and determine which file it needs.
# We then just read in the files that we need and get the data, for each uprn from that file
@ -875,7 +1050,7 @@ def compile_data():
uprn_lookup = {}
for uprn in complete_epcs["uprn"]:
if not uprn:
if pd.isnull(uprn):
# TODO: Do something about this!
continue
filtered_df = uprn_filenames[
@ -914,13 +1089,473 @@ def compile_data():
property_attributes = complete_epcs.merge(
spatial_data_to_uprn,
how="left",
how="inner",
on="uprn"
)
# We drop the columns we don't care about for clustering
property_attributes = property_attributes.drop(
columns=[
"address",
"uprn-source",
"heating-cost-potential",
"hot-water-cost-potential",
"potential-energy-rating",
"environment-impact-potential",
"address3",
"local-authority-label",
"sheating-energy-eff",
"local-authority-label",
"county",
"postcode",
"constituency",
"co2-emissions-potential",
"energy-consumption-potential",
"local-authority",
"inspection-date",
"address1",
"constituency-label",
"building-reference-number",
"floor-energy-eff",
"address2",
"posttown",
"floor-env-eff",
"sheating-env-eff",
"lighting-cost-potential",
"main-heating-controls",
"transaction-type",
"uprn",
"lodgement-date",
"lmk-key",
"wind-turbine-count",
"tenure",
"potential-energy-efficiency",
]
)
# Fields to transform: lodgement-datetime
property_attributes["days_since_last_epc"] = (
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
).dt.days
property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
# Up to:
# Round averages to nearest integer
fill_with_average = [
"low-energy-fixed-light-count",
"floor-height",
"heating-cost-current",
"fixed-lighting-outlets-count",
"hot-water-cost-current",
"number-heated-rooms",
"co2-emiss-curr-per-floor-area",
"total-floor-area",
"environment-impact-current",
"co2-emissions-current",
"number-habitable-rooms",
"energy-consumption-current",
'lighting-cost-current',
"low_energy_lighting",
]
fill_with_mode = [
"multi-glaze-proportion",
"extension-count",
]
fill_with_zero = [
"unheated-corridor-length",
"number-open-fireplaces",
"glazed-area",
"photo-supply",
]
fill_with_categorical = {
"construction-age-band": "unknown",
"mainheat-energy-eff": "N/A",
"windows-env-eff": "N/A",
"lighting-energy-eff": "N/A",
"energy-tariff": 'NO DATA!',
"mechanical-ventilation": 'NO DATA!',
"solar-water-heating-flag": "N",
"mains-gas-flag": "N",
"heat-loss-corridor": "unknown",
"flat-storey-count": "Not a flat",
"roof-energy-eff": "N/A",
"hot-water-env-eff": "N/A",
"mainheatc-energy-eff": "N/A",
"main-fuel": 'NO DATA!',
"lighting-env-eff": "N/A",
"windows-energy-eff": "N/A",
"roof-env-eff": "N/A",
"walls-env-eff": "N/A",
"mainheat-env-eff": "N/A",
"flat-top-storey": "N",
"mainheatc-env-eff": "N",
"floor-level": "NODATA!",
"hot-water-energy-eff": "N/A",
}
# Consolidation columns to single value
consolidation_columns = {
"glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
"mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
"solar-water-heating-flag": {"from": [''], "to": "N"},
"mains-gas-flag": {"from": [''], "to": "N"},
"heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"},
"flat-top-storey": {"from": [''], "to": "N"},
"floor-level": {"from": [""], "to": "NODATA!"}
}
def concatenate_row(row):
    """Join the populated values of ``row`` into one comma-separated string.

    Missing values (None / NaN / NA) and empty strings are skipped; every
    remaining value is converted with ``str`` and the pieces are joined with
    ", " in their original order.

    :param row: a pandas Series (e.g. one row passed by ``DataFrame.apply``
        with ``axis=1``) holding scalar values.
    :return: the joined string, or "" when no value is populated.
    """
    # Filter explicitly instead of using Series.replace('', None): how replace
    # treats value=None has differed between pandas versions (older releases
    # pad-filled rather than inserting None), which could repeat the previous
    # address line in place of an empty one.
    return ', '.join(
        str(value) for value in row if not pd.isna(value) and value != ''
    )
def compile_data_final():
# Updated version:
"""
Various data sources have been produced to create the final data source for Stonewater.
This function combines them
:return:
"""
########################################################################
# Read in data
########################################################################
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
)
udprn_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
# Read in the lookups
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
)))
uprn_lookup_1["match_type"] = "Exact"
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
)))
uprn_lookup_2 = uprn_lookup_2.rename(
columns={
"epc_address": "standardised_address",
"epc_postcode": "standardised_postcode"
}
)
uprn_lookup_2["match_type"] = "EPC"
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
)))
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
concatenate_row, axis=1
)
uprn_lookup_3 = uprn_lookup_3[
["udprn", "uprn", "standardised_address", "postcode"]
].rename(columns={"postcode": "standardised_postcode"})
uprn_lookup_3["match_type"] = "Exact"
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
# prepare lookup 4
uprn_lookup_4 = []
for _, x in uprn_lookup_4_basis.iterrows():
property_type = None
built_form = None
if x["option"] == 1:
uprn = x["os_option_1_uprn"]
standardised_address = x["os_option_1_address"]
postcode = x["os_option_1_postcode"]
elif x["option"] == 2:
uprn = x["os_option_2_uprn"]
standardised_address = x["os_option_2_address"]
postcode = x["os_option_2_postcode"]
else:
uprn = x["manual_uprn"]
standardised_address = x["manual_address"]
postcode = x["manual_postcode"]
uprn_lookup_4.append(
{
"internal_id": x["internal_id"],
"external_address_id": x["external_address_id"],
"uprn": uprn,
"standardised_address": standardised_address,
"standardised_postcode": postcode,
"property_type": property_type,
"built_form": built_form
}
)
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
uprn_lookup_4["match_type"] = "Fuzzy"
# concat
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
# We now merge all of the UPRNs onto the asset list
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
epc_data = json.loads(
read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/epc_data.json"
)
)
epc_data = pd.DataFrame(epc_data)
# We drop some EPCs: keep only those whose internal_id is in uprn_lookup_2
epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
# This we can use to produce additional variables such as number of old surveys
# older_epc_data = json.loads(
# read_from_s3(
# bucket_name="retrofit-data-dev",
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
# )
# )
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
########################################################################
# Prepare asset list
########################################################################
# Keep just the columns we're interested in
asset_list = asset_list[
[
"Osm. ID",
"Org. ref.",
"Postcode",
"House no",
"Name",
"Address line 2",
"City/Town",
"County",
"Address ID", # This is not uprn
"udprn"
]
].rename(
columns={
"Osm. ID": "internal_id",
"Org. ref.": "customer_asset_id",
"Postcode": "postcode",
"House no": "house_number",
"Name": "address1",
"Address line 2": "address2",
"City/Town": "city_town",
"County": "county",
"Address ID": "external_address_id",
}
)
# Create full address
asset_list["full_address"] = np.where(
~pd.isnull(asset_list["address2"]),
(
asset_list["address1"] + ", " +
asset_list["address2"] + ", " +
asset_list["city_town"].str.title() + ", " +
# asset_list["county"] + ", " +
asset_list["postcode"]
),
asset_list["address1"] + ", " +
asset_list["city_town"].str.title() + ", " +
# asset_list["county"] + ", " +
asset_list["postcode"]
)
if pd.isnull(asset_list["full_address"]).sum():
raise ValueError("Missing full addresses")
# Final preps of lookups
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
uprn_lookup_3 = uprn_lookup_3.merge(
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
)
uprn_lookup = pd.concat([
uprn_lookup,
uprn_lookup_3,
uprn_lookup_4
])
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
asset_list = asset_list.merge(
uprn_lookup.drop(columns=["udprn"]),
how="inner",
on=["internal_id", "external_address_id"]
)
# This is everything without a uprn
missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
missing_uprn_with_udprn = missing_uprn[
missing_uprn["udprn"] != "<NA>"
].reset_index(drop=True)
missing_uprn_without_udprn = missing_uprn[
missing_uprn["udprn"] == "<NA>"
].reset_index(drop=True)
missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
# Pull in the best ordnance survey data for each one and manually fix
manua_fix = []
for _, x in missing_uprn_without_udprn.iterrows():
internal_id = x["internal_id"]
os_option_1_address = ""
os_option_1_postcode = ""
os_option_1_uprn = ""
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
os_option_1_address = p_os_data["ADDRESS"].values[0]
os_option_1_postcode = p_os_data["POSTCODE"].values[0]
os_option_1_uprn = p_os_data["UPRN"].values[0]
os_option_2_address = ""
os_option_2_postcode = ""
os_option_2_uprn = ""
if internal_id in os_most_relevant_2_internal_ids:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
os_option_2_address = p_os_data["ADDRESS"].values[0]
os_option_2_postcode = p_os_data["POSTCODE"].values[0]
os_option_2_uprn = p_os_data["UPRN"].values[0]
manua_fix.append(
{
**x.to_dict(),
"os_option_1_address": os_option_1_address,
"os_option_1_postcode": os_option_1_postcode,
"os_option_1_uprn": os_option_1_uprn,
"os_option_2_address": os_option_2_address,
"os_option_2_postcode": os_option_2_postcode,
"os_option_2_uprn": os_option_2_uprn,
}
)
manua_fix = pd.DataFrame(manua_fix)
# manua_fix.to_csv("manual_fix_uprns.csv")
# Split into chunks of 200
api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
import requests
import time
completed_id = 0
uprn_to_udprn = []
for row_index, data in tqdm(missing_uprn_with_udprn.iterrows(), total=len(missing_uprn_with_udprn)):
if row_index < completed_id:
continue
time.sleep(0.5)
# Call the API
udprn = data["udprn"]
url = f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}?api_key={api_key}"
payload = {
"api_key": api_key
}
headers = {
'Accept': 'application/json'
}
response = requests.request("GET", url, headers=headers, data=payload)
if response.status_code != 200:
raise ValueError("API call dead")
result = response.json()
uprn_to_udprn.append(
result["result"]
)
completed_id += 1
# Store in S3
# save_data_to_s3(
# data=json.dumps(uprn_to_udprn),
# s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
# bucket_name="retrofit-data-dev"
# )
test = read_from_s3(
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
bucket_name="retrofit-data-dev"
)
test = pd.DataFrame(json.loads(test))
for _, x in missing_uprn.iterrows():
udprn = x["udprn"]
udprn = None if udprn == "<NA>" else udprn
internal_id = x["internal_id"]
is_flat = "flat" in x["address1"].lower()
# Get the OS data
final_os_data = pd.DataFrame()
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
p_os_data_all = os_all_1[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
p_os_data_all = os_all_2[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
# Try signing up on a free trial with these guys!
# https://ideal-postcodes.co.uk/pricing
# API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
if final_os_data.empty:
boo
continue
if final_os_data.shape[0] != 1:
if final_os_data["UPRN"].nunique() > 1:
raise Exception("Investigate me")
# TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
# This is the first ordnance survey data pull
os_most_relevant_1 = []
os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant_1.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
# This is the second ordnance survey data pull
os_most_relevant_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
)
os_most_relevant_2 = json.loads(os_most_relevant_2)
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
os_all_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
)
os_all_2 = json.loads(os_all_2)