diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e0bb73f4..3007269b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -7,6 +7,7 @@ from datetime import datetime from openai import OpenAI import numpy as np import pandas as pd +from tqdm import tqdm from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc @@ -351,6 +352,9 @@ class AssetList: self.duplicated_addresses = None self.contact_details = None self.contact_detail_fields = None + self.outcomes = None + self.outcomes_no_match = None + self.master_surveyed = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns @@ -758,6 +762,11 @@ class AssetList: for v in missing_variables: self.standardised_asset_list[v] = None + # Convert to string + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = ( + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str) + ) + def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id @@ -1831,3 +1840,120 @@ class AssetList: ) self.hubspot_data = programme_data + + def flag_outcomes( + self, + outcomes_filepath, + outcomes_sheetname + ): + if outcomes_filepath is None: + pass + + # TODO: Parameterise for future use? 
+ self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) + self.outcomes["row_id"] = self.outcomes.index + + logger.info("Matching outcomes to ") + # Merge the outcomes onto the asset list - we check we're able to match sufficiently well + lookup = [] + nomatch = [] + for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)): + address_clean = x["Address"].lower().replace(",", "").replace(" ", " ") + + matched = self.standardised_asset_list[ + (self.standardised_asset_list[ + self.STANDARD_FULL_ADDRESS + ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean) + ] + + if not matched.empty and matched.shape[0] == 1: + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + + nomatch.append(x["row_id"]) + + self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)] + lookup = pd.DataFrame(lookup) + + # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times + # Where we have multiple rows, we want to make a call on what the action should be. 
For example, + # there may be properties that have been visited multiple times where the outcome was "See notes" implying + # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has + # happened multiple times, in this case we judge that the work may not be viable + lookup = lookup.merge( + self.outcomes[["row_id", "Outcome", "Notes", "Week Commencing"]], how="left", on="row_id" + ) + + visit_counts = ( + lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"] + .count() + .reset_index() + .rename(columns={"row_id": "visit_count"}) + .sort_values("visit_count", ascending=False) + ) + + pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index() + pivot_df = pivot_df.merge( + visit_counts, how="left", on="domna_property_id" + ) + + # We merge this data onto outcomes + self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values) + self.outcomes = self.outcomes.merge( + lookup, how="left", on="row_id" + ) + + # We merge our pivoted outcomes onto the asset list + self.standardised_asset_list = self.standardised_asset_list.merge( + pivot_df, how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id" + ) + + def flag_survey_master( + self, + master_filepaths + ): + # TODO: This probably needs further expansion + + logger.info("Getting masters and merging onto asset list") + master_surveyed = [] + for filepath in master_filepaths: + master_data = pd.read_csv(filepath) + # Strip columns + master_data.columns = [c.strip() for c in master_data.columns] + + install_col = ( + "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns + else "INSTALL / CANCELLATION DATE" + ) + + # We just need to check if any were cancelled + master_to_append = master_data[ + ["UPRN", install_col, "SUBMISSION DATE"] + ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"}) + master_to_append["cancelled"] = 
master_to_append["survey_status"].str.lower().str.contains("cancel") + + master_surveyed.append(master_to_append) + + master_surveyed = pd.concat(master_surveyed) + master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] + master_surveyed = master_surveyed[ + ~master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID].isin( + ["NOT ON ASSET LIST", "Missing From Asset List"] + ) + ] + + master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[ + self.STANDARD_LANDLORD_PROPERTY_ID + ].astype(str) + + # We de-dupe crudely on landlord property id + self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID]) + + self.standardised_asset_list = self.standardised_asset_list.merge( + self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID + ) diff --git a/asset_list/app.py b/asset_list/app.py index 7275709d..8e2df56d 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -299,6 +299,9 @@ def app(): landlord_wall_construction = "Wall Constuction" landlord_heating_system = "Heating" landlord_existing_pv = None + outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" + master_filename_eco3 = "ECO 3 -Table 1.csv" + master_filename_eco4 = "ECO 4 -Table 1.csv" # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -354,6 +357,18 @@ def app(): asset_list.apply_standardiation() + # We now flag properties that have been treated under existing programmes + asset_list.flag_outcomes( + outcomes_filepath=os.path.join(data_folder, outcomes_filename), + outcomes_sheetname="Feedback" + ) + + asset_list.flag_survey_master( + master_filepaths=[ + os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None + ], + ) + ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time @@ -497,6 +512,7 @@ def app(): cleaned = msgpack.unpackb(cleaned, raw=False) # TODO: We should break out the identification of work 
types to flag blocks of flats specifically + # TODO: Append existing outcomes onto the sheet. asset_list.identify_worktypes(cleaned) pprint(asset_list.work_type_figures) diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index a5da0c79..eedae9b9 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -96,6 +96,7 @@ def download_data_from_sharepoint(): folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: + # Get the contents folder_contents = sharepoint_client.list_folder_contents( drive_id=sharepoint_client.document_drive["id"],