corrected spelling of built forms

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-13 01:13:19 +01:00
parent 5e84967ee0
commit 6f9a78cabc
2 changed files with 106 additions and 121 deletions

View file

@ -117,8 +117,8 @@ class OrdnanceSuveyClient:
value_map = { value_map = {
# In the OS API, "RD" is a "Dwelling"; however, this is not a valid property type in the EPC database # In the OS API, "RD" is a "Dwelling"; however, this is not a valid property type in the EPC database
'RD': {}, 'RD': {},
'RD02': {'property_type': 'House', 'built_form': 'Detatched'}, 'RD02': {'property_type': 'House', 'built_form': 'Detached'},
'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'}, 'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
'RD06': {'property_type': 'Flat'}, 'RD06': {'property_type': 'Flat'},
} }

View file

@ -1285,7 +1285,7 @@ def compile_data_final():
elif x["option"] == 2: elif x["option"] == 2:
uprn = x["os_option_2_uprn"] uprn = x["os_option_2_uprn"]
standardised_address = x["os_option_2_address"] standardised_address = x["os_option_2_address"]
postcode = x["os_option_2_postcode"] postcode = x["os_option_2_address"].split(", ")[-1]
else: else:
uprn = x["manual_uprn"] uprn = x["manual_uprn"]
standardised_address = x["manual_address"] standardised_address = x["manual_address"]
@ -1347,7 +1347,8 @@ def compile_data_final():
"City/Town", "City/Town",
"County", "County",
"Address ID", # This is not uprn "Address ID", # This is not uprn
"udprn" "udprn",
"Owning body"
] ]
].rename( ].rename(
columns={ columns={
@ -1360,6 +1361,7 @@ def compile_data_final():
"City/Town": "city_town", "City/Town": "city_town",
"County": "county", "County": "county",
"Address ID": "external_address_id", "Address ID": "external_address_id",
"Owning body": "owner"
} }
) )
@ -1400,59 +1402,117 @@ def compile_data_final():
on=["internal_id", "external_address_id"] on=["internal_id", "external_address_id"]
) )
# This is everything without a uprn # Store locally
missing_uprn = asset_list[pd.isnull(asset_list["uprn"])] # asset_list.to_excel("Stonewater asset list with uprn.xlsx")
missing_uprn_with_udprn = missing_uprn[ # We take just domestic properties
missing_uprn["udprn"] != "<NA>"
].reset_index(drop=True)
missing_uprn_without_udprn = missing_uprn[ # This is the first ordnance survey data pull
missing_uprn["udprn"] == "<NA>" os_most_relevant_1 = []
].reset_index(drop=True) os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant_1.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]] os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
# Pull in the best ordnance survey data for each one and manually fix
manua_fix = []
for _, x in missing_uprn_without_udprn.iterrows():
internal_id = x["internal_id"]
os_option_1_address = "" # This is the second ordnance survey data pull
os_option_1_postcode = "" os_most_relevant_2 = read_from_s3(
os_option_1_uprn = "" bucket_name="retrofit-data-dev",
if internal_id in os_most_relevant_1_internal_ids: s3_file_name="customers/Stonewater/clustering/problematic_os.json"
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id] )
os_option_1_address = p_os_data["ADDRESS"].values[0] os_most_relevant_2 = json.loads(os_most_relevant_2)
os_option_1_postcode = p_os_data["POSTCODE"].values[0] os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
os_option_1_uprn = p_os_data["UPRN"].values[0]
os_option_2_address = "" os_all_2 = read_from_s3(
os_option_2_postcode = "" bucket_name="retrofit-data-dev",
os_option_2_uprn = "" s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
if internal_id in os_most_relevant_2_internal_ids: )
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id] os_all_2 = json.loads(os_all_2)
os_option_2_address = p_os_data["ADDRESS"].values[0]
os_option_2_postcode = p_os_data["POSTCODE"].values[0]
os_option_2_uprn = p_os_data["UPRN"].values[0]
manua_fix.append( needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
os_1_ids = os_most_relevant_1["internal_id"].values
os_2_ids = os_most_relevant_2["internal_id"].values
epc_data_batch_2 = []
older_epcs_batch_2 = {}
for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
if pd.isnull(property["uprn"]):
continue
searcher = SearchEpc(
address1=", ".join(property["standardised_address"].split(", ")[:-1]),
postcode=property["standardised_postcode"],
auth_token=EPC_AUTH_TOKEN,
os_api_key="",
full_address=property["standardised_address"],
uprn=property["uprn"]
)
searcher.find_property(skip_os=True)
if searcher.newest_epc is None and property["match_type"] == "Exact":
# Estimate!
# Get the OS data
p_os_df = pd.DataFrame()
if property["internal_id"] in os_1_ids:
p_os_df = pd.DataFrame(
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
)
p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
if p_os_df.empty:
p_os_df = pd.DataFrame(
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
)
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
# Now we estimate
searcher.newest_epc = searcher.estimate_epc(
property_type=searcher.ordnance_survey_client.property_type,
built_form=searcher.ordnance_survey_client.built_form,
lmks_to_drop=None,
exclude_old=True
)
elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
if "flat" in property["standardised_address"].lower():
searcher.newest_epc = searcher.estimate_epc(
property_type="Flat",
built_form=None,
lmks_to_drop=None,
exclude_old=True
)
else:
searcher.newest_epc = searcher.estimate_epc(
property_type="House",
built_form=None,
lmks_to_drop=None,
exclude_old=True
)
epc_data_batch_2.append(
{ {
**x.to_dict(), "internal_id": property["internal_id"],
"os_option_1_address": os_option_1_address, **searcher.newest_epc
"os_option_1_postcode": os_option_1_postcode,
"os_option_1_uprn": os_option_1_uprn,
"os_option_2_address": os_option_2_address,
"os_option_2_postcode": os_option_2_postcode,
"os_option_2_uprn": os_option_2_uprn,
} }
) )
manua_fix = pd.DataFrame(manua_fix) if searcher.older_epcs is not None:
# manua_fix.to_csv("manual_fix_uprns.csv") older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
# Split into chunks of 200
api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp" def pull_ideal_postcodes(missing_uprn_with_udprn):
api_key = "" # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
import requests import requests
import time import time
completed_id = 0 completed_id = 0
@ -1484,78 +1544,3 @@ def compile_data_final():
result["result"] result["result"]
) )
completed_id += 1 completed_id += 1
# Store in S3
# save_data_to_s3(
# data=json.dumps(uprn_to_udprn),
# s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
# bucket_name="retrofit-data-dev"
# )
test = read_from_s3(
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
bucket_name="retrofit-data-dev"
)
test = pd.DataFrame(json.loads(test))
for _, x in missing_uprn.iterrows():
udprn = x["udprn"]
udprn = None if udprn == "<NA>" else udprn
internal_id = x["internal_id"]
is_flat = "flat" in x["address1"].lower()
# Get the OS data
final_os_data = pd.DataFrame()
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
p_os_data_all = os_all_1[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
p_os_data_all = os_all_2[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
# Try signing up on a free trial with these guys!
# https://ideal-postcodes.co.uk/pricing
# API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
if final_os_data.empty:
boo
continue
if final_os_data.shape[0] != 1:
if final_os_data["UPRN"].nunique() > 1:
raise Exception("Investigate me")
# TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
# This is the first ordnance survey data pull
os_most_relevant_1 = []
os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant_1.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
# This is the second ordnance survey data pull
os_most_relevant_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
)
os_most_relevant_2 = json.loads(os_most_relevant_2)
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
os_all_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
)
os_all_2 = json.loads(os_all_2)