mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
attempting to match masters
This commit is contained in:
parent
66e0fdea28
commit
831abc884f
3 changed files with 143 additions and 0 deletions
|
|
@ -7,6 +7,7 @@ from datetime import datetime
|
|||
from openai import OpenAI
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from fuzzywuzzy import process
|
||||
from utils.logger import setup_logger
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
|
@ -351,6 +352,9 @@ class AssetList:
|
|||
self.duplicated_addresses = None
|
||||
self.contact_details = None
|
||||
self.contact_detail_fields = None
|
||||
self.outcomes = None
|
||||
self.outcomes_no_match = None
|
||||
self.master_surveyed = None
|
||||
|
||||
# We detect the presence of the non-intrusive columns
|
||||
self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
|
||||
|
|
@ -758,6 +762,11 @@ class AssetList:
|
|||
for v in missing_variables:
|
||||
self.standardised_asset_list[v] = None
|
||||
|
||||
# Convert to string
|
||||
self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = (
|
||||
self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str)
|
||||
)
|
||||
|
||||
def merge_data(self, df: pd.DataFrame):
|
||||
"""
|
||||
Used to insert data into the standardised asset list, based on the domna property id
|
||||
|
|
@ -1831,3 +1840,120 @@ class AssetList:
|
|||
)
|
||||
|
||||
self.hubspot_data = programme_data
|
||||
|
||||
def flag_outcomes(
    self,
    outcomes_filepath,
    outcomes_sheetname
):
    """
    Match surveyor outcomes to the standardised asset list and merge per-property outcome counts
    onto it.

    The outcomes sheet is read into self.outcomes and given a "row_id" column taken from its index.
    Each outcome is matched to the asset list by comparing normalised addresses (lower-cased,
    commas removed, whitespace collapsed). An outcome only counts as matched when exactly one asset
    list row has the same normalised address; everything else is stored in self.outcomes_no_match.

    After a successful run:
      - self.outcomes has a "matched_to_asset_list" flag and the matched domna property id
      - self.standardised_asset_list has one count column per distinct "Outcome" value plus
        "visit_count", joined on the domna property id

    :param outcomes_filepath: Path of the outcomes Excel workbook. If None, nothing is done.
    :param outcomes_sheetname: Sheet holding the outcomes. It must contain the "Address",
        "Outcome", "Notes" and "Week Commencing" columns.
    """
    if outcomes_filepath is None:
        # Without a file there is nothing to flag - return rather than fall through to read_excel
        logger.info("No outcomes file provided, skipping the flagging of outcomes")
        return

    # ToDo: Parameterise for future use?
    self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
    self.outcomes["row_id"] = self.outcomes.index

    logger.info("Matching outcomes to the asset list")

    def _normalise_addresses(addresses):
        # Same normalisation on both sides of the comparison. Missing addresses are kept as
        # missing so that blank cells can never match one another
        cleaned = (
            addresses.astype(str)
            .str.lower()
            .str.replace(",", "", regex=False)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
        )
        return cleaned.where(addresses.notna())

    # Normalise the asset list once, rather than once per outcome row
    asset_keys = _normalise_addresses(self.standardised_asset_list[self.STANDARD_FULL_ADDRESS])

    # We only trust an address that identifies exactly one property in the asset list
    key_counts = asset_keys.value_counts()
    unambiguous = asset_keys.isin(key_counts[key_counts == 1].index)
    address_to_id = pd.Series(
        self.standardised_asset_list.loc[unambiguous, self.DOMNA_PROPERTY_ID].values,
        index=asset_keys[unambiguous].values,
    )

    outcome_keys = _normalise_addresses(self.outcomes["Address"])
    is_matched = outcome_keys.isin(address_to_id.index)

    self.outcomes_no_match = self.outcomes[~is_matched]

    if not is_matched.any():
        # Nothing to pivot or merge - leave the asset list untouched
        logger.warning("None of the outcomes could be matched to the asset list")
        self.outcomes["matched_to_asset_list"] = False
        self.outcomes[self.DOMNA_PROPERTY_ID] = None
        return

    lookup = pd.DataFrame(
        {
            "row_id": self.outcomes.loc[is_matched, "row_id"].values,
            self.DOMNA_PROPERTY_ID: outcome_keys[is_matched].map(address_to_id).values,
        }
    )

    # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times
    # Where we have multiple rows, we want to make a call on what the action should be. For example,
    # there may be properties that have been visited multiple times where the outcome was "See notes" implying
    # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has
    # happened multiple times, in this case we judge that the work may not be viable
    visits = lookup.merge(
        self.outcomes[["row_id", "Outcome", "Notes", "Week Commencing"]], how="left", on="row_id"
    )

    visit_counts = (
        visits.groupby(self.DOMNA_PROPERTY_ID)["row_id"]
        .count()
        .reset_index()
        .rename(columns={"row_id": "visit_count"})
        .sort_values("visit_count", ascending=False)
    )

    # One row per property, one column per distinct outcome holding the number of visits with it
    pivot_df = (
        visits.groupby([self.DOMNA_PROPERTY_ID, "Outcome"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    pivot_df = pivot_df.merge(
        visit_counts, how="left", on=self.DOMNA_PROPERTY_ID
    )

    # We merge this data onto outcomes. Only the id is brought across, as the outcome columns are
    # already present and merging them again would duplicate them with _x / _y suffixes
    self.outcomes["matched_to_asset_list"] = is_matched
    self.outcomes = self.outcomes.merge(
        lookup, how="left", on="row_id"
    )

    # We merge our pivoted outcomes onto the asset list
    self.standardised_asset_list = self.standardised_asset_list.merge(
        pivot_df, how="left", on=self.DOMNA_PROPERTY_ID
    )
|
||||
|
||||
def flag_survey_master(
    self,
    master_filepaths
):
    """
    Read the survey master CSVs and merge their survey status onto the standardised asset list.

    From each file we keep the "UPRN" column (renamed to the standard landlord property id), the
    install / cancellation column (renamed to "survey_status") and "SUBMISSION DATE", and add a
    boolean "cancelled" column. The files are stacked, rows without a usable property id are
    dropped, and the result is de-duplicated on the landlord property id (first row wins) before
    being stored in self.master_surveyed and left-merged onto the asset list.

    :param master_filepaths: Iterable of paths to the master CSV files. If it yields no paths the
        asset list is left untouched.
    """
    # TODO: This probably needs further expansion

    logger.info("Getting masters and merging onto asset list")
    masters = []
    for filepath in master_filepaths:
        master_data = pd.read_csv(filepath)
        # Strip columns
        master_data.columns = [c.strip() for c in master_data.columns]

        # The masters do not agree on the name of the status column
        install_col = (
            "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
            else "INSTALL / CANCELLATION DATE"
        )

        # We just need to check if any were cancelled
        master_to_append = master_data[
            ["UPRN", install_col, "SUBMISSION DATE"]
        ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"})

        # Cast to string first so that a blank or non-text status column gives False, rather than
        # NaN or an AttributeError from the .str accessor
        master_to_append["cancelled"] = (
            master_to_append["survey_status"].astype(str).str.lower().str.contains("cancel", regex=False)
        )

        masters.append(master_to_append)

    if not masters:
        # pd.concat raises on an empty list, and there is nothing to merge anyway
        logger.info("No master files provided, skipping the flagging of surveyed properties")
        return

    master_surveyed = pd.concat(masters)

    # Drop rows that cannot be tied back to the asset list
    property_ids = master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID]
    has_usable_id = property_ids.notna() & ~property_ids.isin(
        ["NOT ON ASSET LIST", "Missing From Asset List"]
    )
    master_surveyed = master_surveyed[has_usable_id].copy()

    # NOTE(review): if a master's UPRN column was read as float (numeric ids with blank cells) this
    # yields values such as "123.0" - confirm these still line up with the asset list ids
    master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[
        self.STANDARD_LANDLORD_PROPERTY_ID
    ].astype(str)

    # We de-dupe crudely on landlord property id
    self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID])

    self.standardised_asset_list = self.standardised_asset_list.merge(
        self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
    )
|
||||
|
|
|
|||
|
|
@ -299,6 +299,9 @@ def app():
|
|||
landlord_wall_construction = "Wall Constuction"
|
||||
landlord_heating_system = "Heating"
|
||||
landlord_existing_pv = None
|
||||
outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx"
|
||||
master_filename_eco3 = "ECO 3 -Table 1.csv"
|
||||
master_filename_eco4 = "ECO 4 -Table 1.csv"
|
||||
|
||||
# Maps addresses to uprn in problematic cases
|
||||
manual_uprn_map = {}
|
||||
|
|
@ -354,6 +357,18 @@ def app():
|
|||
|
||||
asset_list.apply_standardiation()
|
||||
|
||||
# We now flag properties that have been treated under existing programmes
|
||||
asset_list.flag_outcomes(
|
||||
outcomes_filepath=os.path.join(data_folder, outcomes_filename),
|
||||
outcomes_sheetname="Feedback"
|
||||
)
|
||||
|
||||
asset_list.flag_survey_master(
|
||||
master_filepaths=[
|
||||
os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None
|
||||
],
|
||||
)
|
||||
|
||||
### We retrieve the EPC data
|
||||
|
||||
# We chunk up this data into 5000 rows at a time
|
||||
|
|
@ -497,6 +512,7 @@ def app():
|
|||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
# TODO: We should break out the identification of work types to flag blocks of flats specifically
|
||||
# TODO: Append existing outcomes onto the sheet.
|
||||
asset_list.identify_worktypes(cleaned)
|
||||
|
||||
pprint(asset_list.work_type_figures)
|
||||
|
|
|
|||
|
|
@ -96,6 +96,7 @@ def download_data_from_sharepoint():
|
|||
folder for folder in contents["value"] if folder["name"] in folders_to_keep
|
||||
]
|
||||
for folder_to_pull in folders_to_pull:
|
||||
|
||||
# Get the contents
|
||||
folder_contents = sharepoint_client.list_folder_contents(
|
||||
drive_id=sharepoint_client.document_drive["id"],
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue