expanded eco3 matching

This commit is contained in:
Khalim Conn-Kowlessar 2024-03-07 14:18:08 +00:00
parent 7f88f0e0f5
commit 9a0c6c3e8f

View file

@ -172,7 +172,7 @@ class DataLoader:
}
UNMATCHED_ECO3 = {
"HA25": 119
"HA25": 154
}
def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
@ -1508,12 +1508,16 @@ class DataLoader:
eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list")
eco3_list = eco3_list_correction_function(eco3_list)
asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower()
eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "")
if ha_name == "HA25":
# 317 -> 259
missed_postcodes = {
postcode.lower() for postcode in eco3_list["Post Code"] if
postcode.lower() not in asset_list["matching_postcode"].values
postcode for postcode in eco3_list["postcode_no_space"] if
postcode not in asset_list["matching_postcode_nospace"].values
}
eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)]
eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)]
# For the asset list, we create a matching address without any punctuation
# TODO: We should generally just remove punctuation from addresses when matching
@ -1530,11 +1534,11 @@ class DataLoader:
for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
# if row["eco3_list_row_id"] == "HA25_Eco3_5422":
# raise Exception()
postcode = row["Post Code"].lower().strip()
postcode = row["postcode_no_space"]
# df will never be empty, since we've already done a check for common postcodes
df = asset_list[
asset_list["matching_postcode"].str.contains(postcode)
asset_list["matching_postcode_nospace"].str.contains(postcode)
]
house_number = row["NO "]
@ -1588,6 +1592,8 @@ class DataLoader:
f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
)
# 154 missed, 2827 matched for HA 25
matching_lookup = pd.DataFrame(matching_lookup)
# Check dupes as this will cause problems later on
if matching_lookup["asset_list_row_id"].duplicated().any():