mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
corrected spelling of built forms
This commit is contained in:
parent
5e84967ee0
commit
6f9a78cabc
2 changed files with 106 additions and 121 deletions
|
|
@ -117,8 +117,8 @@ class OrdnanceSuveyClient:
|
|||
value_map = {
|
||||
# In the OS API, "RD" is a "Dwelling"; however, this is not a valid property type in the EPC database
|
||||
'RD': {},
|
||||
'RD02': {'property_type': 'House', 'built_form': 'Detatched'},
|
||||
'RD03': {'property_type': 'House', 'built_form': 'Semi-Detatched'},
|
||||
'RD02': {'property_type': 'House', 'built_form': 'Detached'},
|
||||
'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
|
||||
'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
|
||||
'RD06': {'property_type': 'Flat'},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1285,7 +1285,7 @@ def compile_data_final():
|
|||
elif x["option"] == 2:
|
||||
uprn = x["os_option_2_uprn"]
|
||||
standardised_address = x["os_option_2_address"]
|
||||
postcode = x["os_option_2_postcode"]
|
||||
postcode = x["os_option_2_address"].split(", ")[-1]
|
||||
else:
|
||||
uprn = x["manual_uprn"]
|
||||
standardised_address = x["manual_address"]
|
||||
|
|
@ -1347,7 +1347,8 @@ def compile_data_final():
|
|||
"City/Town",
|
||||
"County",
|
||||
"Address ID", # This is not uprn
|
||||
"udprn"
|
||||
"udprn",
|
||||
"Owning body"
|
||||
]
|
||||
].rename(
|
||||
columns={
|
||||
|
|
@ -1360,6 +1361,7 @@ def compile_data_final():
|
|||
"City/Town": "city_town",
|
||||
"County": "county",
|
||||
"Address ID": "external_address_id",
|
||||
"Owning body": "owner"
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -1400,59 +1402,117 @@ def compile_data_final():
|
|||
on=["internal_id", "external_address_id"]
|
||||
)
|
||||
|
||||
# This is everything without a uprn
|
||||
missing_uprn = asset_list[pd.isnull(asset_list["uprn"])]
|
||||
# Store locally
|
||||
# asset_list.to_excel("Stonewater asset list with uprn.xlsx")
|
||||
|
||||
missing_uprn_with_udprn = missing_uprn[
|
||||
missing_uprn["udprn"] != "<NA>"
|
||||
].reset_index(drop=True)
|
||||
# We take just domestic properties
|
||||
|
||||
missing_uprn_without_udprn = missing_uprn[
|
||||
missing_uprn["udprn"] == "<NA>"
|
||||
].reset_index(drop=True)
|
||||
# This is the first ordnance survey data pull
|
||||
os_most_relevant_1 = []
|
||||
os_all_1 = {}
|
||||
for i in tqdm(["1", "2", "3"]):
|
||||
most_relevant_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||
)
|
||||
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
||||
os_all_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||
)
|
||||
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
||||
|
||||
missing_uprn_without_udprn = missing_uprn_without_udprn[["internal_id", "external_address_id", "full_address"]]
|
||||
# Pull in the best ordnance survey data for each one and manually fix
|
||||
manua_fix = []
|
||||
for _, x in missing_uprn_without_udprn.iterrows():
|
||||
internal_id = x["internal_id"]
|
||||
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
||||
|
||||
os_option_1_address = ""
|
||||
os_option_1_postcode = ""
|
||||
os_option_1_uprn = ""
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
os_option_1_address = p_os_data["ADDRESS"].values[0]
|
||||
os_option_1_postcode = p_os_data["POSTCODE"].values[0]
|
||||
os_option_1_uprn = p_os_data["UPRN"].values[0]
|
||||
# This is the second ordnance survey data pull
|
||||
os_most_relevant_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
||||
)
|
||||
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
||||
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
||||
|
||||
os_option_2_address = ""
|
||||
os_option_2_postcode = ""
|
||||
os_option_2_uprn = ""
|
||||
if internal_id in os_most_relevant_2_internal_ids:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
os_option_2_address = p_os_data["ADDRESS"].values[0]
|
||||
os_option_2_postcode = p_os_data["POSTCODE"].values[0]
|
||||
os_option_2_uprn = p_os_data["UPRN"].values[0]
|
||||
os_all_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
||||
)
|
||||
os_all_2 = json.loads(os_all_2)
|
||||
|
||||
manua_fix.append(
|
||||
needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
|
||||
|
||||
os_1_ids = os_most_relevant_1["internal_id"].values
|
||||
os_2_ids = os_most_relevant_2["internal_id"].values
|
||||
|
||||
epc_data_batch_2 = []
|
||||
older_epcs_batch_2 = {}
|
||||
for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
|
||||
if pd.isnull(property["uprn"]):
|
||||
continue
|
||||
searcher = SearchEpc(
|
||||
address1=", ".join(property["standardised_address"].split(", ")[:-1]),
|
||||
postcode=property["standardised_postcode"],
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
full_address=property["standardised_address"],
|
||||
uprn=property["uprn"]
|
||||
)
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
if searcher.newest_epc is None and property["match_type"] == "Exact":
|
||||
# Estimate!
|
||||
# Get the OS data
|
||||
p_os_df = pd.DataFrame()
|
||||
if property["internal_id"] in os_1_ids:
|
||||
p_os_df = pd.DataFrame(
|
||||
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
|
||||
)
|
||||
p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
|
||||
|
||||
if p_os_df.empty:
|
||||
p_os_df = pd.DataFrame(
|
||||
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
|
||||
)
|
||||
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
|
||||
|
||||
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
|
||||
# Now we estimate
|
||||
searcher.newest_epc = searcher.estimate_epc(
|
||||
property_type=searcher.ordnance_survey_client.property_type,
|
||||
built_form=searcher.ordnance_survey_client.built_form,
|
||||
lmks_to_drop=None,
|
||||
exclude_old=True
|
||||
)
|
||||
|
||||
elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
|
||||
|
||||
if "flat" in property["standardised_address"].lower():
|
||||
searcher.newest_epc = searcher.estimate_epc(
|
||||
property_type="Flat",
|
||||
built_form=None,
|
||||
lmks_to_drop=None,
|
||||
exclude_old=True
|
||||
)
|
||||
else:
|
||||
searcher.newest_epc = searcher.estimate_epc(
|
||||
property_type="House",
|
||||
built_form=None,
|
||||
lmks_to_drop=None,
|
||||
exclude_old=True
|
||||
)
|
||||
|
||||
epc_data_batch_2.append(
|
||||
{
|
||||
**x.to_dict(),
|
||||
"os_option_1_address": os_option_1_address,
|
||||
"os_option_1_postcode": os_option_1_postcode,
|
||||
"os_option_1_uprn": os_option_1_uprn,
|
||||
|
||||
"os_option_2_address": os_option_2_address,
|
||||
"os_option_2_postcode": os_option_2_postcode,
|
||||
"os_option_2_uprn": os_option_2_uprn,
|
||||
"internal_id": property["internal_id"],
|
||||
**searcher.newest_epc
|
||||
}
|
||||
)
|
||||
|
||||
manua_fix = pd.DataFrame(manua_fix)
|
||||
# manua_fix.to_csv("manual_fix_uprns.csv")
|
||||
if searcher.older_epcs is not None:
|
||||
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
|
||||
|
||||
# Split into chunks of 200
|
||||
api_key = "ak_lxcapii7HnEhGKxuVmPquzTYKu9vp"
|
||||
|
||||
def pull_ideal_postcodes(missing_uprn_with_udprn):
|
||||
api_key = "" # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
|
||||
import requests
|
||||
import time
|
||||
completed_id = 0
|
||||
|
|
@ -1484,78 +1544,3 @@ def compile_data_final():
|
|||
result["result"]
|
||||
)
|
||||
completed_id += 1
|
||||
|
||||
# Store in S3
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(uprn_to_udprn),
|
||||
# s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
test = read_from_s3(
|
||||
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
test = pd.DataFrame(json.loads(test))
|
||||
|
||||
for _, x in missing_uprn.iterrows():
|
||||
udprn = x["udprn"]
|
||||
udprn = None if udprn == "<NA>" else udprn
|
||||
internal_id = x["internal_id"]
|
||||
|
||||
is_flat = "flat" in x["address1"].lower()
|
||||
# Get the OS data
|
||||
final_os_data = pd.DataFrame()
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_1[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
||||
p_os_data_all = os_all_2[str(internal_id)]
|
||||
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
||||
|
||||
# Try signing up on a free trial with these guys!
|
||||
# https://ideal-postcodes.co.uk/pricing
|
||||
# API example: https://docs.ideal-postcodes.co.uk/docs/api/udprn
|
||||
|
||||
if final_os_data.empty:
|
||||
boo
|
||||
continue
|
||||
|
||||
if final_os_data.shape[0] != 1:
|
||||
if final_os_data["UPRN"].nunique() > 1:
|
||||
raise Exception("Investigate me")
|
||||
|
||||
# TODO: We should do a different variation of similarity, where we strip out "Flat" and "Room x" if they are there
|
||||
# This is the first ordnance survey data pull
|
||||
os_most_relevant_1 = []
|
||||
os_all_1 = {}
|
||||
for i in tqdm(["1", "2", "3"]):
|
||||
most_relevant_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||
)
|
||||
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
||||
os_all_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||
)
|
||||
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
||||
|
||||
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
||||
|
||||
# This is the second ordnance survey data pull
|
||||
os_most_relevant_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
||||
)
|
||||
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
||||
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
||||
|
||||
os_all_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
||||
)
|
||||
os_all_2 = json.loads(os_all_2)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue