From 7291f7128e6b5403132e5afdcc56330ea3d71f15 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:11:29 +0000 Subject: [PATCH] started wates matching --- .../stonewater/Wave 3 Preparation.py | 119 +++++++++++++----- 1 file changed, 91 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fa548f0d..cbbf04c6 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3331,7 +3331,7 @@ def revised_model(): ) if to_filter.sum() == 0: - blah + raise Exception("Error") filtered = filtered[to_filter] if filtered.empty: @@ -3347,34 +3347,97 @@ def revised_model(): ) continue - blah2 + raise Exception("No match") - # home["Name"] should be contained in the survey_folder - # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] - # # We have an edge case wher some properties have two outputs in Sharepoint - # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": - # raise Exception("Fix me1") - # # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] - # - # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - # raise Exception("Fix me2") - # # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] - # - # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': - # filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] - # - # if filtered.empty: - # continue - # if filtered.shape[0] != 1: - # raise Exception("something went wrong") - # - # matching_lookup.append( - # { - # "survey_folder": filtered["survey_folder"].values[0], - # "Address ID": home["Address ID"], - # "Name": home["Name"] - # } - # ) + ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup) + # We get a match for all records + assert 
ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] + assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + + # We do the same for Wates + wates_coordination = wates_coordination.rename( + columns={"Post Code": "Postcode"} + ) + wates_coordination = wates_coordination[ + wates_coordination["Retrofit Assessment"].isin(["Completed"]) + ] + + wates_manual_filters = {} + wates_matching_lookup = [] + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in wates_manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). 
+ str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 + ) + if to_filter.sum() == 0: + # We also have some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") # if __name__ == "__main__": # main()