corrected spelling of built forms

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-13 01:13:19 +01:00
parent 5e84967ee0
commit 6f9a78cabc
2 changed files with 106 additions and 121 deletions

View file

@ -117,8 +117,8 @@ class OrdnanceSuveyClient:
value_map = {
# In the OS API, "RD" is a "Dwelling"; however, this is not a valid property type in the EPC database
'RD': {},
'RD02': {'property_type': 'House', 'built_form': 'Detatched'},
'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'},
'RD02': {'property_type': 'House', 'built_form': 'Detached'},
'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
'RD06': {'property_type': 'Flat'},
}

View file

@ -1285,7 +1285,7 @@ def compile_data_final():
elif x["option"] == 2:
uprn = x["os_option_2_uprn"]
standardised_address = x["os_option_2_address"]
postcode = x["os_option_2_postcode"]
postcode = x["os_option_2_address"].split(", ")[-1]
else:
uprn = x["manual_uprn"]
standardised_address = x["manual_address"]
@ -1347,7 +1347,8 @@ def compile_data_final():
"City/Town",
"County",
"Address ID", # This is not uprn
"udprn"
"udprn",
"Owning body"
]
].rename(
columns={
@ -1360,6 +1361,7 @@ def compile_data_final():
"City/Town": "city_town",
"County": "county",
"Address ID": "external_address_id",
"Owning body": "owner"
}
)
@ -1400,59 +1402,117 @@ def compile_data_final():
on=["internal_id", "external_address_id"]
)
# This is everything without a uprn
missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
# Store locally
# asset_list.to_excel("Stonewater asset list with uprn.xlsx")
missing_uprn_with_udprn = missing_uprn[
missing_uprn["udprn"] != "<NA>"
].reset_index(drop=True)
# We take just domestic properties
missing_uprn_without_udprn = missing_uprn[
missing_uprn["udprn"] == "<NA>"
].reset_index(drop=True)
# This is the first ordnance survey data pull
os_most_relevant_1 = []
os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant_1.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
# Pull in the best ordnance survey data for each one and manually fix
manua_fix = []
for _, x in missing_uprn_without_udprn.iterrows():
internal_id = x["internal_id"]
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
os_option_1_address = ""
os_option_1_postcode = ""
os_option_1_uprn = ""
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
os_option_1_address = p_os_data["ADDRESS"].values[0]
os_option_1_postcode = p_os_data["POSTCODE"].values[0]
os_option_1_uprn = p_os_data["UPRN"].values[0]
# This is the second ordnance survey data pull
os_most_relevant_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
)
os_most_relevant_2 = json.loads(os_most_relevant_2)
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
os_option_2_address = ""
os_option_2_postcode = ""
os_option_2_uprn = ""
if internal_id in os_most_relevant_2_internal_ids:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
os_option_2_address = p_os_data["ADDRESS"].values[0]
os_option_2_postcode = p_os_data["POSTCODE"].values[0]
os_option_2_uprn = p_os_data["UPRN"].values[0]
os_all_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
)
os_all_2 = json.loads(os_all_2)
manua_fix.append(
needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
os_1_ids = os_most_relevant_1["internal_id"].values
os_2_ids = os_most_relevant_2["internal_id"].values
epc_data_batch_2 = []
older_epcs_batch_2 = {}
for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
if pd.isnull(property["uprn"]):
continue
searcher = SearchEpc(
address1=", ".join(property["standardised_address"].split(", ")[:-1]),
postcode=property["standardised_postcode"],
auth_token=EPC_AUTH_TOKEN,
os_api_key="",
full_address=property["standardised_address"],
uprn=property["uprn"]
)
searcher.find_property(skip_os=True)
if searcher.newest_epc is None and property["match_type"] == "Exact":
# Estimate!
# Get the OS data
p_os_df = pd.DataFrame()
if property["internal_id"] in os_1_ids:
p_os_df = pd.DataFrame(
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
)
p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
if p_os_df.empty:
p_os_df = pd.DataFrame(
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
)
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
# Now we estimate
searcher.newest_epc = searcher.estimate_epc(
property_type=searcher.ordnance_survey_client.property_type,
built_form=searcher.ordnance_survey_client.built_form,
lmks_to_drop=None,
exclude_old=True
)
elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
if "flat" in property["standardised_address"].lower():
searcher.newest_epc = searcher.estimate_epc(
property_type="Flat",
built_form=None,
lmks_to_drop=None,
exclude_old=True
)
else:
searcher.newest_epc = searcher.estimate_epc(
property_type="House",
built_form=None,
lmks_to_drop=None,
exclude_old=True
)
epc_data_batch_2.append(
{
**x.to_dict(),
"os_option_1_address": os_option_1_address,
"os_option_1_postcode": os_option_1_postcode,
"os_option_1_uprn": os_option_1_uprn,
"os_option_2_address": os_option_2_address,
"os_option_2_postcode": os_option_2_postcode,
"os_option_2_uprn": os_option_2_uprn,
"internal_id": property["internal_id"],
**searcher.newest_epc
}
)
manua_fix = pd.DataFrame(manua_fix)
# manua_fix.to_csv("manual_fix_uprns.csv")
if searcher.older_epcs is not None:
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
# Split into chunks of 200
api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
def pull_ideal_postcodes(missing_uprn_with_udprn):
api_key = "" # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
import requests
import time
completed_id = 0
@ -1484,78 +1544,3 @@ def compile_data_final():
result["result"]
)
completed_id += 1
# Store in S3
# save_data_to_s3(
# data=json.dumps(uprn_to_udprn),
# s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
# bucket_name="retrofit-data-dev"
# )
test = read_from_s3(
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
bucket_name="retrofit-data-dev"
)
test = pd.DataFrame(json.loads(test))
for _, x in missing_uprn.iterrows():
udprn = x["udprn"]
udprn = None if udprn == "<NA>" else udprn
internal_id = x["internal_id"]
is_flat = "flat" in x["address1"].lower()
# Get the OS data
final_os_data = pd.DataFrame()
if internal_id in os_most_relevant_1_internal_ids:
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
p_os_data_all = os_all_1[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
p_os_data_all = os_all_2[str(internal_id)]
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
# Try signing up on a free trial with these guys!
# https://ideal-postcodes.co.uk/pricing
# API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
if final_os_data.empty:
boo
continue
if final_os_data.shape[0] != 1:
if final_os_data["UPRN"].nunique() > 1:
raise Exception("Investigate me")
# TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
# This is the first ordnance survey data pull
os_most_relevant_1 = []
os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
most_relevant_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
)
os_most_relevant_1.extend(json.loads(most_relevant_segment))
os_all_segment = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
)
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
# This is the second ordnance survey data pull
os_most_relevant_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
)
os_most_relevant_2 = json.loads(os_most_relevant_2)
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
os_all_2 = read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
)
os_all_2 = json.loads(os_all_2)