matched submissions

This commit is contained in:
Khalim Conn-Kowlessar 2025-04-13 21:54:32 +01:00
parent 3cfe938e27
commit 83a1ac8cf3
2 changed files with 23 additions and 9 deletions

View file

@ -5,7 +5,6 @@ import tiktoken
from pprint import pprint
from datetime import datetime
from docutils.utils.math.tex2mathml_extern import blahtexml
from openai import OpenAI
import numpy as np
import pandas as pd
@ -379,9 +378,10 @@ class AssetList:
self.contact_details = None
self.contact_detail_fields = None
self.outcomes = None
self.outcomes_no_match = None
self.outcomes_no_match = pd.DataFrame()
self.outcomes_for_output = pd.DataFrame()
self.master_surveyed = None
self.unmatched_submissions = pd.DataFrame()
# When this is True, we intend to break the programme into multiple phases. We may need to review
# how this is structured in the future, as depending on how we get future data, we may need to
@ -2249,6 +2249,7 @@ class AssetList:
logger.info("Getting masters and merging onto asset list")
master_surveyed = []
unmatched_submissions = []
for filepath in master_filepaths:
master_data = pd.read_csv(filepath)
# Strip columns
@ -2293,14 +2294,17 @@ class AssetList:
axis=1
)
postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code"
house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO"
# Otherwise, we need to match algorithmically
logger.info("Matching master data to asset list")
matched = []
unmatched = []
for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
if pd.isnull(row["POSTCODE"]):
if pd.isnull(row[postcode_col]):
continue
postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower()
postcode_no_space = row[postcode_col].strip().replace(" ", "").lower()
df = self.standardised_asset_list[
(
@ -2310,7 +2314,7 @@ class AssetList:
)
]
house_no = row["NO"]
house_no = row[house_no_col]
if house_no in df["house_no"].values:
df = df[df["house_no"] == house_no]
@ -2326,7 +2330,7 @@ class AssetList:
df = df[
df[self.STANDARD_FULL_ADDRESS].str.lower().apply(
lambda x: process.extractOne(
" ".join([row["NO"], row["Street / Block Name"], row["TOWN"]]).lower(),
" ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(),
x
)[1]
) > 90
@ -2337,11 +2341,11 @@ class AssetList:
continue
if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
" ".join([row["NO"], row["Street / Block Name"]]).lower()
" ".join([row[house_no_col], row["Street / Block Name"]]).lower()
)):
df = df[
df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
" ".join([row["NO"], row["Street / Block Name"]]).lower()
" ".join([row[house_no_col], row["Street / Block Name"]]).lower()
)
]
@ -2384,7 +2388,7 @@ class AssetList:
unmatched_df = master_data[
master_data["row_id"].isin(unmatched)
]
submissions_unmatched.append(unmatched_df)
unmatched_submissions.append(unmatched_df)
master_surveyed = pd.concat(master_surveyed)
master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])]
@ -2404,3 +2408,7 @@ class AssetList:
self.standardised_asset_list = self.standardised_asset_list.merge(
self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
)
# Finally, we keep a record of the unmatched submissions
if unmatched_submissions:
self.unmatched_submissions = pd.concat(unmatched_submissions)

View file

@ -943,5 +943,11 @@ def app():
if not asset_list.outcomes_for_output.empty:
asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False)
if not asset_list.unmatched_submissions.empty:
asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False)
if not asset_list.outcomes_no_match.empty:
asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False)
# Store the HubSpot export as a CSV
hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False)