From b09bd63b53c8d9b14f11c1c5b7cb38b28c63afbc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 12:53:25 +0000 Subject: [PATCH] done with ha25 matching for now --- .../ha_15_32/ha_analysis_batch_3.py | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a5845990..f0813aef 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -172,7 +172,7 @@ class DataLoader: } UNMATCHED_ECO3 = { - "HA25": 94 + "HA25": 119 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -478,7 +478,7 @@ class DataLoader: # For HA1 and HA25, there is an exception in the structure of the data. We don't have any survey or ciga # lists, and so # we can return the asset list now - if ha_name in ["HA1", "HA25"]: + if ha_name in ["HA1"]: return asset_list, pd.DataFrame(), pd.DataFrame() # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be @@ -1460,10 +1460,8 @@ class DataLoader: return survey_list - def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): - - # May need an eco3 list correction function - + @staticmethod + def correct_ha25_eco3_list(eco3_list): # NEADS DRIVE, postcode with bs305dt, is not found in the asset list eco3_list = eco3_list[ ~(eco3_list["Post Code"] == "BS305DT") @@ -1483,6 +1481,29 @@ class DataLoader: "Hall Road", "Hall Rd" ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "BOND SPEAR COURT", "BOND-SPEAR COURT" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "ST.MARYS HILL", "ST MARYS HILL" + ) + # Correct the postcode for edmund road + eco3_list["Post Code"] = np.where( + (eco3_list["Street / Block Name"] == "EDMUND ROAD") & + (eco3_list["Post Code"] == "TR14 8QJ"), + "TR15 1BY", + eco3_list["Post Code"] + ) + return eco3_list + + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): + + eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") + eco3_list = eco3_list_correction_function(eco3_list) + if ha_name == "HA25": missed_postcodes = { postcode.lower() for postcode in eco3_list["Post Code"] if @@ -1492,8 +1513,9 @@ class DataLoader: # For the asset list, we create a matching address without any punctuation # TODO: We should generally just remove puncutation from addresses when matching - asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '', - regex=True) + asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace( + r'[^\w\s]', '', regex=True + ) # Remove double spaces asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace( " ", " " @@ -1502,6 +1524,8 @@ class DataLoader: matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): + # if row["eco3_list_row_id"] == "HA25_Eco3_5422": + # raise Exception() postcode = row["Post Code"].lower().strip() # df will never be empty, since we've already done a check for common postcodes @@ -1553,38 +1577,24 @@ class DataLoader: ) # We verify the missed - # -HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted - # on properties that had house numbers outside of the asset list + # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2 + # where many surveys were conducted on house numbers, not in the asset list if len(missed) != self.UNMATCHED_ECO3[ha_name]: raise ValueError( f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) - # TODO: 194 missed - matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on if matching_lookup["asset_list_row_id"].duplicated().any(): raise ValueError("Duplicated asset list row ids") - missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] - missed_df.head(3).tail(1)["eco3_list_row_id"] + # Merge onto eco3 list + eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id") - duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist() - duped_df = matching_lookup[ - matching_lookup["asset_list_row_id"].isin(duped_ids) - ] - duped_surveys = eco3_list[ - eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values) - ].copy() + asset_list = asset_list.drop(columns=["matching_address_no_punctuation"]) - duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id") - - duped_surveys[ - ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"] - ].sort_values("asset_list_row_id").head() - - asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values + return eco3_list @staticmethod def extract_streetname(address, house_number=None, postcode=None):