matched submissions

2026-07-27 23:35:01 +00:00 · 2025-04-13 21:54:32 +01:00 · 2025-04-13 21:54:32 +01:00 · 83a1ac8cf3
commit 83a1ac8cf3
parent 3cfe938e27
2 changed files with 23 additions and 9 deletions
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -5,7 +5,6 @@ import tiktoken
 from pprint import pprint
 from datetime import datetime

-from docutils.utils.math.tex2mathml_extern import blahtexml
 from openai import OpenAI
 import numpy as np
 import pandas as pd
@@ -379,9 +378,10 @@ class AssetList:
        self.contact_details = None
        self.contact_detail_fields = None
        self.outcomes = None
-        self.outcomes_no_match = None
+        self.outcomes_no_match = pd.DataFrame()
        self.outcomes_for_output = pd.DataFrame()
        self.master_surveyed = None
+        self.unmatched_submissions = pd.DataFrame()

        # When this is True, we intend to break the programme into multiple phases. We may need to review
        # how this is structured in the future, as depending on how we get future data, we may need to
@@ -2249,6 +2249,7 @@ class AssetList:

        logger.info("Getting masters and merging onto asset list")
        master_surveyed = []
+        unmatched_submissions = []
        for filepath in master_filepaths:
            master_data = pd.read_csv(filepath)
            # Strip columns
@@ -2293,14 +2294,17 @@ class AssetList:
                axis=1
            )

+            postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code"
+            house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO"
+
            # Otherwise, we need to match algorithmically
            logger.info("Matching master data to asset list")
            matched = []
            unmatched = []
            for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
-                if pd.isnull(row["POSTCODE"]):
+                if pd.isnull(row[postcode_col]):
                    continue
-                postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower()
+                postcode_no_space = row[postcode_col].strip().replace(" ", "").lower()

                df = self.standardised_asset_list[
                    (
@@ -2310,7 +2314,7 @@ class AssetList:
                    )
                ]

-                house_no = row["NO"]
+                house_no = row[house_no_col]

                if house_no in df["house_no"].values:
                    df = df[df["house_no"] == house_no]
@@ -2326,7 +2330,7 @@ class AssetList:
                            df = df[
                                df[self.STANDARD_FULL_ADDRESS].str.lower().apply(
                                    lambda x: process.extractOne(
-                                        " ".join([row["NO"], row["Street / Block Name"], row["TOWN"]]).lower(),
+                                        " ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(),
                                        x
                                    )[1]
                                ) > 90
@@ -2337,11 +2341,11 @@ class AssetList:
                                continue

                        if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
-                            " ".join([row["NO"], row["Street / Block Name"]]).lower()
+                            " ".join([row[house_no_col], row["Street / Block Name"]]).lower()
                        )):
                            df = df[
                                df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
-                                    " ".join([row["NO"], row["Street / Block Name"]]).lower()
+                                    " ".join([row[house_no_col], row["Street / Block Name"]]).lower()
                                )
                            ]

@@ -2384,7 +2388,7 @@ class AssetList:
            unmatched_df = master_data[
                master_data["row_id"].isin(unmatched)
            ]
-            submissions_unmatched.append(unmatched_df)
+            unmatched_submissions.append(unmatched_df)

        master_surveyed = pd.concat(master_surveyed)
        master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])]
@@ -2404,3 +2408,7 @@ class AssetList:
        self.standardised_asset_list = self.standardised_asset_list.merge(
            self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
        )
+
+        # Finally, we keep a record of the unmatched
+        if unmatched_submissions:
+            self.unmatched_submissions = pd.concat(unmatched_submissions)
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -943,5 +943,11 @@ def app():
        if not asset_list.outcomes_for_output.empty:
            asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False)

+        if not asset_list.unmatched_submissions.empty:
+            asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False)
+
+        if not asset_list.outcomes_no_match.empty:
+            asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False)
+
    # Store the Hubspot export as a csv
    hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False)