From 83a1ac8cf347bdb7538b45f577263771da86b0a9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 13 Apr 2025 21:54:32 +0100 Subject: [PATCH] matched submissions --- asset_list/AssetList.py | 26 +++++++++++++++++--------- asset_list/app.py | 6 ++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 48ea22f4..9657f289 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -5,7 +5,6 @@ import tiktoken from pprint import pprint from datetime import datetime -from docutils.utils.math.tex2mathml_extern import blahtexml from openai import OpenAI import numpy as np import pandas as pd @@ -379,9 +378,10 @@ class AssetList: self.contact_details = None self.contact_detail_fields = None self.outcomes = None - self.outcomes_no_match = None + self.outcomes_no_match = pd.DataFrame() self.outcomes_for_output = pd.DataFrame() self.master_surveyed = None + self.unmatched_submissions = pd.DataFrame() # When this is True, we intend to break the programme into multiple phases. We may need to review # how this is structured in the future, as depending on how we get future data, we may need to @@ -2249,6 +2249,7 @@ class AssetList: logger.info("Getting masters and merging onto asset list") master_surveyed = [] + unmatched_submissions = [] for filepath in master_filepaths: master_data = pd.read_csv(filepath) # Strip columns @@ -2293,14 +2294,17 @@ class AssetList: axis=1 ) + postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code" + house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO" + # Otherwise, we need to match algorithmically logger.info("Matching master data to asset list") matched = [] unmatched = [] for _, row in tqdm(master_data.iterrows(), total=len(master_data)): - if pd.isnull(row["POSTCODE"]): + if pd.isnull(row[postcode_col]): continue - postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower() + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() df = self.standardised_asset_list[ ( @@ -2310,7 +2314,7 @@ class AssetList: ) ] - house_no = row["NO"] + house_no = row[house_no_col] if house_no in df["house_no"].values: df = df[df["house_no"] == house_no] @@ -2326,7 +2330,7 @@ class AssetList: df = df[ df[self.STANDARD_FULL_ADDRESS].str.lower().apply( lambda x: process.extractOne( - " ".join([row["NO"], row["Street / Block Name"], row["TOWN"]]).lower(), + " ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(), x )[1] ) > 90 @@ -2337,11 +2341,11 @@ class AssetList: continue if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( - " ".join([row["NO"], row["Street / Block Name"]]).lower() + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() )): df = df[ df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( - " ".join([row["NO"], row["Street / Block Name"]]).lower() + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() ) ] @@ -2384,7 +2388,7 @@ class AssetList: unmatched_df = master_data[ master_data["row_id"].isin(unmatched) ] - submissions_unmatched.append(unmatched_df) + unmatched_submissions.append(unmatched_df) master_surveyed = pd.concat(master_surveyed) master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] @@ -2404,3 +2408,7 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list.merge( self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID ) + + # Finally, we keep a record of the unmatched + if unmatched_submissions: + self.unmatched_submissions = pd.concat(unmatched_submissions) diff --git a/asset_list/app.py b/asset_list/app.py index ee74b337..a284371e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -943,5 +943,11 @@ def app(): if not asset_list.outcomes_for_output.empty: asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) + if not asset_list.unmatched_submissions.empty: + asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False) + + if not asset_list.outcomes_no_match.empty: + asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False) + # Store the Hubspot export as a csv hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False)