attempting to match masters

This commit is contained in:
Khalim Conn-Kowlessar 2025-03-08 15:38:05 +00:00
parent 66e0fdea28
commit 831abc884f
3 changed files with 143 additions and 0 deletions

View file

@ -7,6 +7,7 @@ from datetime import datetime
from openai import OpenAI
import numpy as np
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import process
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
@ -351,6 +352,9 @@ class AssetList:
self.duplicated_addresses = None
self.contact_details = None
self.contact_detail_fields = None
self.outcomes = None
self.outcomes_no_match = None
self.master_surveyed = None
# We detect the presence of the non-intrusive columns
self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
@ -758,6 +762,11 @@ class AssetList:
for v in missing_variables:
self.standardised_asset_list[v] = None
# Convert to string
self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = (
self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str)
)
def merge_data(self, df: pd.DataFrame):
"""
Used to insert data into the standardised asset list, based on the domna property id
@ -1831,3 +1840,120 @@ class AssetList:
)
self.hubspot_data = programme_data
def flag_outcomes(
    self,
    outcomes_filepath,
    outcomes_sheetname
):
    """
    Reads the surveyor outcomes sheet, matches each outcome row to the standardised asset list
    on a normalised full address and merges per-property outcome counts onto the asset list.

    Sets self.outcomes (the sheet, plus row_id, matched_to_asset_list and the domna property id),
    and self.outcomes_no_match (the rows that could not be matched to exactly one asset).
    When at least one row matches, self.standardised_asset_list gains one count column per
    distinct "Outcome" value plus "visit_count".

    :param outcomes_filepath: Path to the outcomes Excel workbook. If None, nothing is done.
    :param outcomes_sheetname: Name of the sheet within the workbook that holds the outcomes
    """
    if outcomes_filepath is None:
        logger.warning("No outcomes file provided - skipping outcome flagging")
        return

    def _address_key(addresses):
        # Lower-case, drop commas, collapse runs of whitespace and trim. Non-string values
        # (e.g. blank cells) come out as NaN rather than raising, so they simply never match
        return (
            addresses.astype(object)
            .str.lower()
            .str.replace(",", "", regex=False)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
        )

    # TODO: Parameterise for future use?
    self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
    self.outcomes["row_id"] = self.outcomes.index

    logger.info("Matching outcomes to asset list")
    # We normalise the asset list addresses once and keep only those that identify exactly one
    # property - an address that appears on several asset rows is ambiguous so is never matched
    assets = self.standardised_asset_list
    asset_keys = _address_key(assets[self.STANDARD_FULL_ADDRESS])
    unambiguous = asset_keys.notna() & ~asset_keys.duplicated(keep=False)
    address_to_id = pd.Series(
        assets.loc[unambiguous, self.DOMNA_PROPERTY_ID].values,
        index=asset_keys[unambiguous].values,
    )

    outcome_keys = _address_key(self.outcomes["Address"])
    is_matched = outcome_keys.isin(address_to_id.index)
    # Taken before any further columns are added, so it mirrors the raw sheet plus row_id
    self.outcomes_no_match = self.outcomes[~is_matched].copy()
    logger.info(
        "Matched %s of %s outcome rows to the asset list", int(is_matched.sum()), len(self.outcomes)
    )

    # We record the match on the outcomes themselves
    self.outcomes["matched_to_asset_list"] = is_matched
    self.outcomes[self.DOMNA_PROPERTY_ID] = outcome_keys.map(address_to_id)

    if not is_matched.any():
        # Nothing to pivot - the asset list is left untouched rather than failing on an empty merge
        logger.warning("No outcomes could be matched to the asset list")
        return

    # Built from the matched rows only, so the property id keeps the asset list's dtype
    lookup = pd.DataFrame(
        {
            "row_id": self.outcomes.loc[is_matched, "row_id"].values,
            self.DOMNA_PROPERTY_ID: outcome_keys[is_matched].map(address_to_id).values,
            "Outcome": self.outcomes.loc[is_matched, "Outcome"].values,
        }
    )

    # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times
    # Where we have multiple rows, we want to make a call on what the action should be. For example,
    # there may be properties that have been visited multiple times where the outcome was "See notes" implying
    # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has
    # happened multiple times, in this case we judge that the work may not be viable
    visit_counts = (
        lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"]
        .count()
        .rename("visit_count")
        .reset_index()
    )
    # One row per property, one column per distinct outcome, holding how often it was recorded
    pivot_df = (
        lookup.groupby([self.DOMNA_PROPERTY_ID, "Outcome"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    pivot_df = pivot_df.merge(visit_counts, how="left", on=self.DOMNA_PROPERTY_ID)

    # We merge our pivoted outcomes onto the asset list
    self.standardised_asset_list = self.standardised_asset_list.merge(
        pivot_df, how="left", on=self.DOMNA_PROPERTY_ID
    )
def flag_survey_master(
    self,
    master_filepaths
):
    """
    Reads the survey master CSVs, works out which properties were surveyed and whether the
    job was cancelled, and merges that onto the standardised asset list by landlord property id.

    Sets self.master_surveyed (one row per landlord property id) and adds the columns
    "survey_status", "SUBMISSION DATE" and "cancelled" to self.standardised_asset_list.

    :param master_filepaths: Iterable of paths to the master CSV files. Each file must contain
        "UPRN", "SUBMISSION DATE" and either "INSTALLED OR CANCELLED" or
        "INSTALL / CANCELLATION DATE" (column names are stripped of whitespace before use).
        If empty or None, nothing is done.
    """
    # TODO: This probably needs further expansion
    logger.info("Getting masters and merging onto asset list")
    master_filepaths = list(master_filepaths or [])
    if not master_filepaths:
        # pd.concat raises on an empty list, so there is nothing to merge
        logger.warning("No master files provided - skipping master flagging")
        return

    id_col = self.STANDARD_LANDLORD_PROPERTY_ID
    master_surveyed = []
    for filepath in master_filepaths:
        # Read everything as text: a numeric UPRN column with blanks would otherwise be parsed
        # as float and stringify as "123.0", which never matches the asset list ids
        master_data = pd.read_csv(filepath, dtype=str)
        # Strip columns
        master_data.columns = [c.strip() for c in master_data.columns]

        install_col = (
            "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
            else "INSTALL / CANCELLATION DATE"
        )
        # We just need to check if any were cancelled
        master_to_append = master_data[
            ["UPRN", install_col, "SUBMISSION DATE"]
        ].rename(columns={"UPRN": id_col, install_col: "survey_status"})
        # Blank statuses count as not cancelled, which keeps the flag strictly boolean
        master_to_append["cancelled"] = (
            master_to_append["survey_status"].str.lower().str.contains("cancel", regex=False, na=False)
        )
        master_surveyed.append(master_to_append)

    master_surveyed = pd.concat(master_surveyed, ignore_index=True)
    # Drop rows with no id, or with a placeholder meaning the property is not on the asset list
    master_surveyed = master_surveyed[master_surveyed[id_col].notna()]
    master_surveyed = master_surveyed[
        ~master_surveyed[id_col].isin(
            ["NOT ON ASSET LIST", "Missing From Asset List"]
        )
    ]
    master_surveyed[id_col] = master_surveyed[id_col].astype(str)

    # We de-dupe crudely on landlord property id
    self.master_surveyed = master_surveyed.drop_duplicates(subset=[id_col])

    self.standardised_asset_list = self.standardised_asset_list.merge(
        self.master_surveyed, how="left", on=id_col
    )

View file

@ -299,6 +299,9 @@ def app():
landlord_wall_construction = "Wall Constuction"
landlord_heating_system = "Heating"
landlord_existing_pv = None
outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx"
master_filename_eco3 = "ECO 3 -Table 1.csv"
master_filename_eco4 = "ECO 4 -Table 1.csv"
# Maps addresses to uprn in problematic cases
manual_uprn_map = {}
@ -354,6 +357,18 @@ def app():
asset_list.apply_standardiation()
# We now flag properties that have been treated under existing programmes
asset_list.flag_outcomes(
outcomes_filepath=os.path.join(data_folder, outcomes_filename),
outcomes_sheetname="Feedback"
)
asset_list.flag_survey_master(
master_filepaths=[
os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None
],
)
### We retrieve the EPC data
# We chunk up this data into 5000 rows at a time
@ -497,6 +512,7 @@ def app():
cleaned = msgpack.unpackb(cleaned, raw=False)
# TODO: We should break out the identification of work types to flag blocks of flats specifically
# TODO: Append existing outcomes onto the sheet.
asset_list.identify_worktypes(cleaned)
pprint(asset_list.work_type_figures)

View file

@ -96,6 +96,7 @@ def download_data_from_sharepoint():
folder for folder in contents["value"] if folder["name"] in folders_to_keep
]
for folder_to_pull in folders_to_pull:
# Get the contents
folder_contents = sharepoint_client.list_folder_contents(
drive_id=sharepoint_client.document_drive["id"],