mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
merging asset list with uprns for stonewater
This commit is contained in:
parent
667ed1b990
commit
5e84967ee0
1 changed files with 660 additions and 25 deletions
|
|
@ -5,6 +5,7 @@ from dotenv import load_dotenv
|
|||
from backend.SearchEpc import SearchEpc
|
||||
import urllib.parse
|
||||
import requests
|
||||
from datetime import datetime
|
||||
|
||||
from fuzzywuzzy import fuzz
|
||||
import numpy as np
|
||||
|
|
@ -631,6 +632,23 @@ def app():
|
|||
# "Address ID": "external_address_id",
|
||||
|
||||
|
||||
def _flatten_os_records(records):
    """Build a DataFrame from raw OS records, one row per record.

    Each record is a dict; its "DPA" entry is used when present, otherwise
    its "LPI" entry.
    """
    return pd.DataFrame([r["DPA"] if "DPA" in r else r["LPI"] for r in records])


def _rows_matching_udprn(frame, udprn):
    """Return the rows of *frame* whose UDPRN, compared as a string, equals *udprn*.

    A frame without a "UDPRN" column yields an empty frame instead of raising
    a KeyError, so callers can keep testing the result with ``.empty``.
    """
    if "UDPRN" not in frame.columns:
        return frame.iloc[0:0]
    # Compare as strings on both sides: the column may hold ints or strings
    # depending on where the data came from.
    return frame[frame["UDPRN"].astype(str) == udprn]


def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat=False):
    """Pick the OS row(s) for one property.

    :param p_os_data: DataFrame of the "most relevant" OS rows for the property.
    :param p_os_data_all: iterable of raw OS record dicts, each holding a "DPA"
        or an "LPI" entry.
    :param udprn: the property's UDPRN, or None when it is unknown.
    :param is_flat: when True and no UDPRN is known, only rows whose
        CLASSIFICATION_CODE is "RD06" are considered (presumably the OS code
        for flats - confirm). Defaults to False so that the existing
        three-argument calls in this file keep working.
    :return: DataFrame, possibly empty.
        * udprn is None: the first candidate row from ``p_os_data_all``
          (after the optional RD06 filter).
        * otherwise: rows of ``p_os_data`` matching the UDPRN, falling back to
          matching rows of ``p_os_data_all`` when there are none.
    """
    if udprn is None:
        candidates = _flatten_os_records(p_os_data_all)
        if is_flat:
            if "CLASSIFICATION_CODE" not in candidates.columns:
                # Nothing can be confirmed as RD06 (e.g. no records at all).
                return candidates.iloc[0:0]
            candidates = candidates[candidates["CLASSIFICATION_CODE"] == "RD06"]
        return candidates.head(1)

    udprn = str(udprn)

    final_os_data = _rows_matching_udprn(p_os_data, udprn)
    if final_os_data.empty:
        # Nothing in the most-relevant rows: search the full record set.
        final_os_data = _rows_matching_udprn(_flatten_os_records(p_os_data_all), udprn)

    return final_os_data
|
||||
|
||||
|
||||
def compile_data():
|
||||
"""
|
||||
Various data sources have been produced to create the final data source for Stonewater.
|
||||
|
|
@ -640,13 +658,53 @@ def compile_data():
|
|||
########################################################################
|
||||
# Read in data
|
||||
########################################################################
|
||||
asset_list = read_excel_from_s3(
|
||||
file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
bucket_name="retrofit-data-dev",
|
||||
header_row=4
|
||||
# asset_list = read_excel_from_s3(
|
||||
# file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# header_row=4
|
||||
# )
|
||||
#
|
||||
# udprn_data = read_excel_from_s3(
|
||||
# file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx",
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# header_row=0
|
||||
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
||||
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
||||
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
||||
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
||||
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
||||
|
||||
# Read in the lookups
|
||||
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
||||
)))
|
||||
|
||||
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
||||
)))
|
||||
uprn_lookup_2 = uprn_lookup_2.rename(
|
||||
columns={
|
||||
"epc_address": "standardised_address",
|
||||
"epc_postcode": "standardised_postcode"
|
||||
}
|
||||
)
|
||||
|
||||
# concat
|
||||
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
||||
|
||||
# TODO: Read in UPRNs or UDPRN
|
||||
# UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2
|
||||
|
||||
epc_data = json.loads(
|
||||
read_from_s3(
|
||||
|
|
@ -660,13 +718,13 @@ def compile_data():
|
|||
epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]
|
||||
|
||||
# This we can use to produce additional variables such as number of old surveys
|
||||
older_epc_data = json.loads(
|
||||
read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
||||
)
|
||||
)
|
||||
older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
||||
# older_epc_data = json.loads(
|
||||
# read_from_s3(
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
||||
# )
|
||||
# )
|
||||
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
||||
|
||||
# This is the first ordnance survey data pull
|
||||
os_most_relevant_1 = []
|
||||
|
|
@ -703,8 +761,6 @@ def compile_data():
|
|||
# Prepare asset list
|
||||
########################################################################
|
||||
# TODO: Merge on UPRNs
|
||||
# Drop the bottom 4 rows, which are completely missing
|
||||
asset_list = asset_list.head(-4)
|
||||
|
||||
# Keep just the columns we're interested in
|
||||
asset_list = asset_list[
|
||||
|
|
@ -718,6 +774,7 @@ def compile_data():
|
|||
"City/Town",
|
||||
"County",
|
||||
"Address ID", # This is not uprn
|
||||
"udprn"
|
||||
]
|
||||
].rename(
|
||||
columns={
|
||||
|
|
@ -752,8 +809,17 @@ def compile_data():
|
|||
if pd.isnull(asset_list["full_address"]).sum():
|
||||
raise ValueError("Missing full addresses")
|
||||
|
||||
# Merge on UDPRN
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"]
|
||||
)
|
||||
|
||||
# This is everything without a uprn
|
||||
|
||||
# Quick check to see if we have os data for every property that doesn't have an EPC
|
||||
without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
|
||||
|
||||
os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
|
||||
os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
|
||||
|
||||
|
|
@ -773,14 +839,124 @@ def compile_data():
|
|||
if len(missing_os_data):
|
||||
raise Exception("We don't have SOME data for each internal_id")
|
||||
|
||||
# For the EPC data, some of them are missing UPRN
|
||||
epc_data_to_address = asset_list[
|
||||
asset_list["internal_id"].isin(epc_data["internal_id"].values)
|
||||
][
|
||||
["full_address", "internal_id"]].merge(
|
||||
epc_data, how="left", on="internal_id"
|
||||
# Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address
|
||||
address_uprn_udprn_lookup = []
|
||||
for _, x in without_epc.iterrows():
|
||||
if pd.isnull(x["UDPRN"]):
|
||||
continue
|
||||
udprn = str(int(x["UDPRN"]))
|
||||
internal_id = x["internal_id"]
|
||||
|
||||
is_flat = "flat" in x["address1"].lower()
|
||||
|
||||
# Get the OS data
|
||||
final_os_data = pd.DataFrame()
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_1[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_2[str(internal_id)]
|
||||
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
if final_os_data.empty:
|
||||
continue
|
||||
|
||||
if final_os_data.shape[0] != 1:
|
||||
if final_os_data["UPRN"].nunique() > 1:
|
||||
raise Exception("Investigate me")
|
||||
|
||||
address_uprn_udprn_lookup.append(
|
||||
{
|
||||
"internal_id": internal_id,
|
||||
"external_address_id": x["external_address_id"],
|
||||
"udprn": udprn,
|
||||
"uprn": final_os_data["UPRN"].values[0],
|
||||
"standardised_address": final_os_data["ADDRESS"].values[0],
|
||||
"standardised_postcode": final_os_data["POSTCODE"].values[0]
|
||||
}
|
||||
)
|
||||
|
||||
# Store this lookup
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(address_uprn_udprn_lookup),
|
||||
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup)
|
||||
missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)]
|
||||
|
||||
address_comparison = (
|
||||
asset_list[
|
||||
["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"]
|
||||
].merge(
|
||||
epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename(
|
||||
columns={
|
||||
"address": "epc_address",
|
||||
"postcode": "epc_postcode",
|
||||
"address1": "epc_address1"
|
||||
}
|
||||
),
|
||||
how="inner",
|
||||
on="internal_id"
|
||||
)
|
||||
)
|
||||
missed_uprn = epc_data_to_address[epc_data_to_address["uprn"] == ""]
|
||||
|
||||
address_comparison["address_similarity_score"] = address_comparison.apply(
|
||||
lambda x: fuzz.ratio(
|
||||
remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()),
|
||||
remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower())
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False)
|
||||
# Cond
|
||||
confident = address_comparison[address_comparison["address_similarity_score"] >= 95]
|
||||
low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy()
|
||||
|
||||
lookup_2 = confident[
|
||||
[
|
||||
'internal_id', 'external_address_id', 'UDPRN', 'uprn',
|
||||
'epc_address', 'epc_postcode']
|
||||
].rename(columns={"UDPRN": "udprn"})
|
||||
|
||||
# Store in S3
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(lookup_2.to_dict("records")),
|
||||
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
# Need to deal with the low confidence records
|
||||
low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])]
|
||||
for _, x in low_confidence_asset_list.iterrows():
|
||||
udprn = str(int(x["UDPRN"]))
|
||||
internal_id = x["internal_id"]
|
||||
# Get the OS data
|
||||
final_os_data = pd.DataFrame()
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_1[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
|
||||
|
||||
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_2[str(internal_id)]
|
||||
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
|
||||
|
||||
# For the EPC data, some of them are missing UPRN
|
||||
epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id")
|
||||
epc_data["uprn"] = np.where(
|
||||
epc_data["uprn"] == "",
|
||||
epc_data["mapped_uprn"],
|
||||
epc_data["uprn"]
|
||||
)
|
||||
epc_data = epc_data.drop(columns=["mapped_uprn"])
|
||||
|
||||
# Once we have UPRNs, we might want to pull in the EPC data again
|
||||
# epc_data_with_uprn = []
|
||||
|
|
@ -864,8 +1040,7 @@ def compile_data():
|
|||
]
|
||||
)
|
||||
|
||||
# We now pull additional data
|
||||
uprns = complete_epcs["uprn"].tolist()
|
||||
# We now pull spatial data
|
||||
# We get the spatial file list and loop through each EPC and determine which file it needs.
|
||||
# We then just read in the files that we need and get the data, for each uprn from that file
|
||||
|
||||
|
|
@ -875,7 +1050,7 @@ def compile_data():
|
|||
|
||||
uprn_lookup = {}
|
||||
for uprn in complete_epcs["uprn"]:
|
||||
if not uprn:
|
||||
if pd.isnull(uprn):
|
||||
# TODO: Do something about this!
|
||||
continue
|
||||
filtered_df = uprn_filenames[
|
||||
|
|
@ -914,13 +1089,473 @@ def compile_data():
|
|||
|
||||
property_attributes = complete_epcs.merge(
|
||||
spatial_data_to_uprn,
|
||||
how="left",
|
||||
how="inner",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# We drop the columns we don't care about for clustering
|
||||
property_attributes = property_attributes.drop(
|
||||
columns=[
|
||||
|
||||
"address",
|
||||
"uprn-source",
|
||||
"heating-cost-potential",
|
||||
"hot-water-cost-potential",
|
||||
"potential-energy-rating",
|
||||
"environment-impact-potential",
|
||||
"address3",
|
||||
"local-authority-label",
|
||||
"sheating-energy-eff",
|
||||
"local-authority-label",
|
||||
"county",
|
||||
"postcode",
|
||||
"constituency",
|
||||
"co2-emissions-potential",
|
||||
"energy-consumption-potential",
|
||||
"local-authority",
|
||||
"inspection-date",
|
||||
"address1",
|
||||
"constituency-label",
|
||||
"building-reference-number",
|
||||
"floor-energy-eff",
|
||||
"address2",
|
||||
"posttown",
|
||||
"floor-env-eff",
|
||||
"sheating-env-eff",
|
||||
"lighting-cost-potential",
|
||||
"main-heating-controls",
|
||||
"transaction-type",
|
||||
"uprn",
|
||||
"lodgement-date",
|
||||
"lmk-key",
|
||||
"wind-turbine-count",
|
||||
"tenure",
|
||||
"potential-energy-efficiency",
|
||||
]
|
||||
)
|
||||
|
||||
# Fields to transform: lodgement-datetime
|
||||
property_attributes["days_since_last_epc"] = (
|
||||
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
|
||||
).dt.days
|
||||
|
||||
property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
|
||||
|
||||
# Up to:
|
||||
# Round averages to nearest integer
|
||||
fill_with_average = [
|
||||
"low-energy-fixed-light-count",
|
||||
"floor-height",
|
||||
"heating-cost-current",
|
||||
"fixed-lighting-outlets-count",
|
||||
"hot-water-cost-current",
|
||||
"number-heated-rooms",
|
||||
"co2-emiss-curr-per-floor-area",
|
||||
"total-floor-area",
|
||||
"environment-impact-current",
|
||||
"co2-emissions-current",
|
||||
"number-habitable-rooms",
|
||||
"energy-consumption-current",
|
||||
'lighting-cost-current',
|
||||
"low_energy_lighting",
|
||||
]
|
||||
|
||||
fill_with_mode = [
|
||||
"multi-glaze-proportion",
|
||||
"extension-count",
|
||||
]
|
||||
|
||||
fill_with_zero = [
|
||||
"unheated-corridor-length",
|
||||
"number-open-fireplaces",
|
||||
"glazed-area",
|
||||
"photo-supply",
|
||||
]
|
||||
|
||||
fill_with_categorical = {
|
||||
"construction-age-band": "unknown",
|
||||
"mainheat-energy-eff": "N/A",
|
||||
"windows-env-eff": "N/A",
|
||||
"lighting-energy-eff": "N/A",
|
||||
"energy-tariff": 'NO DATA!',
|
||||
"mechanical-ventilation": 'NO DATA!',
|
||||
"solar-water-heating-flag": "N",
|
||||
"mains-gas-flag": "N",
|
||||
"heat-loss-corridor": "unknown",
|
||||
"flat-storey-count": "Not a flat",
|
||||
"roof-energy-eff": "N/A",
|
||||
"hot-water-env-eff": "N/A",
|
||||
"mainheatc-energy-eff": "N/A",
|
||||
"main-fuel": 'NO DATA!',
|
||||
"lighting-env-eff": "N/A",
|
||||
"windows-energy-eff": "N/A",
|
||||
"roof-env-eff": "N/A",
|
||||
"walls-env-eff": "N/A",
|
||||
"mainheat-env-eff": "N/A",
|
||||
"flat-top-storey": "N",
|
||||
"mainheatc-env-eff": "N",
|
||||
"floor-level": "NODATA!",
|
||||
"hot-water-energy-eff": "N/A",
|
||||
}
|
||||
|
||||
# Consolidation columns to single value
|
||||
consolidation_columns = {
|
||||
"glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
|
||||
"mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
|
||||
"solar-water-heating-flag": {"from": [''], "to": "N"},
|
||||
"mains-gas-flag": {"from": [''], "to": "N"},
|
||||
"heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"},
|
||||
"flat-top-storey": {"from": [''], "to": "N"},
|
||||
"floor-level": {"from": [""], "to": "NODATA!"}
|
||||
}
|
||||
|
||||
|
||||
def concatenate_row(row):
    """Join the non-missing, non-blank values of *row* with ", ".

    :param row: a pandas Series of scalar values (e.g. one row of address
        parts passed by ``DataFrame.apply(..., axis=1)``).
    :return: the remaining values converted to ``str`` and joined in their
        original order; an empty string when nothing remains.
    """
    # Filter explicitly rather than via Series.replace('', None): on older
    # pandas versions that call forward-fills instead of inserting None, so a
    # blank part would be replaced by the previous one.
    return ', '.join(str(value) for value in row if pd.notna(value) and value != '')
|
||||
|
||||
|
||||
def compile_data_final():
|
||||
# Updated version:
|
||||
|
||||
"""
|
||||
Various data sources have been produced to create the final data source for Stonewater.
|
||||
This function combines them
|
||||
:return:
|
||||
"""
|
||||
########################################################################
|
||||
# Read in data
|
||||
########################################################################
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
||||
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
||||
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
||||
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
||||
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
||||
|
||||
# Read in the lookups
|
||||
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
||||
)))
|
||||
uprn_lookup_1["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
||||
)))
|
||||
uprn_lookup_2 = uprn_lookup_2.rename(
|
||||
columns={
|
||||
"epc_address": "standardised_address",
|
||||
"epc_postcode": "standardised_postcode"
|
||||
}
|
||||
)
|
||||
uprn_lookup_2["match_type"] = "EPC"
|
||||
|
||||
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
|
||||
)))
|
||||
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
|
||||
concatenate_row, axis=1
|
||||
)
|
||||
uprn_lookup_3 = uprn_lookup_3[
|
||||
["udprn", "uprn", "standardised_address", "postcode"]
|
||||
].rename(columns={"postcode": "standardised_postcode"})
|
||||
uprn_lookup_3["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
|
||||
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
|
||||
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
|
||||
# prepare lookup 4
|
||||
uprn_lookup_4 = []
|
||||
for _, x in uprn_lookup_4_basis.iterrows():
|
||||
|
||||
property_type = None
|
||||
built_form = None
|
||||
if x["option"] == 1:
|
||||
uprn = x["os_option_1_uprn"]
|
||||
standardised_address = x["os_option_1_address"]
|
||||
postcode = x["os_option_1_postcode"]
|
||||
elif x["option"] == 2:
|
||||
uprn = x["os_option_2_uprn"]
|
||||
standardised_address = x["os_option_2_address"]
|
||||
postcode = x["os_option_2_postcode"]
|
||||
else:
|
||||
uprn = x["manual_uprn"]
|
||||
standardised_address = x["manual_address"]
|
||||
postcode = x["manual_postcode"]
|
||||
|
||||
uprn_lookup_4.append(
|
||||
{
|
||||
"internal_id": x["internal_id"],
|
||||
"external_address_id": x["external_address_id"],
|
||||
"uprn": uprn,
|
||||
"standardised_address": standardised_address,
|
||||
"standardised_postcode": postcode,
|
||||
"property_type": property_type,
|
||||
"built_form": built_form
|
||||
}
|
||||
)
|
||||
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
|
||||
uprn_lookup_4["match_type"] = "Fuzzy"
|
||||
|
||||
# concat
|
||||
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
||||
|
||||
# We now merge all of the UPRNs onto the asset list
|
||||
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
|
||||
|
||||
epc_data = json.loads(
|
||||
read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data.json"
|
||||
)
|
||||
)
|
||||
epc_data = pd.DataFrame(epc_data)
|
||||
|
||||
# We drop come EPCS
|
||||
epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
|
||||
|
||||
# This we can use to produce additional variables such as number of old surveys
|
||||
# older_epc_data = json.loads(
|
||||
# read_from_s3(
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
||||
# )
|
||||
# )
|
||||
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
||||
|
||||
########################################################################
|
||||
# Prepare asset list
|
||||
########################################################################
|
||||
|
||||
# Keep just the columns we're interested in
|
||||
asset_list = asset_list[
|
||||
[
|
||||
"Osm. ID",
|
||||
"Org. ref.",
|
||||
"Postcode",
|
||||
"House no",
|
||||
"Name",
|
||||
"Address line 2",
|
||||
"City/Town",
|
||||
"County",
|
||||
"Address ID", # This is not uprn
|
||||
"udprn"
|
||||
]
|
||||
].rename(
|
||||
columns={
|
||||
"Osm. ID": "internal_id",
|
||||
"Org. ref.": "customer_asset_id",
|
||||
"Postcode": "postcode",
|
||||
"House no": "house_number",
|
||||
"Name": "address1",
|
||||
"Address line 2": "address2",
|
||||
"City/Town": "city_town",
|
||||
"County": "county",
|
||||
"Address ID": "external_address_id",
|
||||
}
|
||||
)
|
||||
|
||||
# Create full address
|
||||
asset_list["full_address"] = np.where(
|
||||
~pd.isnull(asset_list["address2"]),
|
||||
(
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["address2"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
# asset_list["county"] + ", " +
|
||||
asset_list["postcode"]
|
||||
),
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
# asset_list["county"] + ", " +
|
||||
asset_list["postcode"]
|
||||
)
|
||||
|
||||
if pd.isnull(asset_list["full_address"]).sum():
|
||||
raise ValueError("Missing full addresses")
|
||||
|
||||
# Final preps of lookups
|
||||
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
|
||||
uprn_lookup_3 = uprn_lookup_3.merge(
|
||||
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
|
||||
)
|
||||
uprn_lookup = pd.concat([
|
||||
uprn_lookup,
|
||||
uprn_lookup_3,
|
||||
uprn_lookup_4
|
||||
])
|
||||
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
uprn_lookup.drop(columns=["udprn"]),
|
||||
how="inner",
|
||||
on=["internal_id", "external_address_id"]
|
||||
)
|
||||
|
||||
# This is everything without a uprn
|
||||
missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
|
||||
|
||||
missing_uprn_with_udprn = missing_uprn[
|
||||
missing_uprn["udprn"] != "<NA>"
|
||||
].reset_index(drop=True)
|
||||
|
||||
missing_uprn_without_udprn = missing_uprn[
|
||||
missing_uprn["udprn"] == "<NA>"
|
||||
].reset_index(drop=True)
|
||||
|
||||
missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
|
||||
# Pull in the best ordnance survey data for each one and manually fix
|
||||
manua_fix = []
|
||||
for _, x in missing_uprn_without_udprn.iterrows():
|
||||
internal_id = x["internal_id"]
|
||||
|
||||
os_option_1_address = ""
|
||||
os_option_1_postcode = ""
|
||||
os_option_1_uprn = ""
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
os_option_1_address = p_os_data["ADDRESS"].values[0]
|
||||
os_option_1_postcode = p_os_data["POSTCODE"].values[0]
|
||||
os_option_1_uprn = p_os_data["UPRN"].values[0]
|
||||
|
||||
os_option_2_address = ""
|
||||
os_option_2_postcode = ""
|
||||
os_option_2_uprn = ""
|
||||
if internal_id in os_most_relevant_2_internal_ids:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
os_option_2_address = p_os_data["ADDRESS"].values[0]
|
||||
os_option_2_postcode = p_os_data["POSTCODE"].values[0]
|
||||
os_option_2_uprn = p_os_data["UPRN"].values[0]
|
||||
|
||||
manua_fix.append(
|
||||
{
|
||||
**x.to_dict(),
|
||||
"os_option_1_address": os_option_1_address,
|
||||
"os_option_1_postcode": os_option_1_postcode,
|
||||
"os_option_1_uprn": os_option_1_uprn,
|
||||
|
||||
"os_option_2_address": os_option_2_address,
|
||||
"os_option_2_postcode": os_option_2_postcode,
|
||||
"os_option_2_uprn": os_option_2_uprn,
|
||||
}
|
||||
)
|
||||
|
||||
manua_fix = pd.DataFrame(manua_fix)
|
||||
# manua_fix.to_csv("manual_fix_uprns.csv")
|
||||
|
||||
# Split into chunks of 200
|
||||
api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
|
||||
import requests
|
||||
import time
|
||||
completed_id = 0
|
||||
|
||||
uprn_to_udprn = []
|
||||
for row_index, data in tqdm(missing_uprn_with_udprn.iterrows(), total=len(missing_uprn_with_udprn)):
|
||||
if row_index < completed_id:
|
||||
continue
|
||||
time.sleep(0.5)
|
||||
|
||||
# Call the API
|
||||
udprn = data["udprn"]
|
||||
|
||||
url = f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}?api_key={api_key}"
|
||||
|
||||
payload = {
|
||||
"api_key": api_key
|
||||
}
|
||||
headers = {
|
||||
'Accept': 'application/json'
|
||||
}
|
||||
|
||||
response = requests.request("GET", url, headers=headers, data=payload)
|
||||
if response.status_code != 200:
|
||||
raise ValueError("API call dead")
|
||||
|
||||
result = response.json()
|
||||
uprn_to_udprn.append(
|
||||
result["result"]
|
||||
)
|
||||
completed_id += 1
|
||||
|
||||
# Store in S3
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(uprn_to_udprn),
|
||||
# s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
test = read_from_s3(
|
||||
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
test = pd.DataFrame(json.loads(test))
|
||||
|
||||
for _, x in missing_uprn.iterrows():
|
||||
udprn = x["udprn"]
|
||||
udprn = None if udprn == "<NA>" else udprn
|
||||
internal_id = x["internal_id"]
|
||||
|
||||
is_flat = "flat" in x["address1"].lower()
|
||||
# Get the OS data
|
||||
final_os_data = pd.DataFrame()
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_1[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_2[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
# Try signing up on a free trial with these guys!
|
||||
# https://ideal-postcodes.co.uk/pricing
|
||||
# API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
|
||||
|
||||
if final_os_data.empty:
|
||||
boo
|
||||
continue
|
||||
|
||||
if final_os_data.shape[0] != 1:
|
||||
if final_os_data["UPRN"].nunique() > 1:
|
||||
raise Exception("Investigate me")
|
||||
|
||||
# TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
|
||||
# This is the first ordnance survey data pull
|
||||
os_most_relevant_1 = []
|
||||
os_all_1 = {}
|
||||
for i in tqdm(["1", "2", "3"]):
|
||||
most_relevant_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||
)
|
||||
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
||||
os_all_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||
)
|
||||
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
||||
|
||||
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
||||
|
||||
# This is the second ordnance survey data pull
|
||||
os_most_relevant_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
||||
)
|
||||
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
||||
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
||||
|
||||
os_all_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
||||
)
|
||||
os_all_2 = json.loads(os_all_2)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue