attempting to match masters

2026-07-27 23:35:01 +00:00 · 2025-03-08 15:38:05 +00:00 · 2025-03-08 15:38:05 +00:00 · 831abc884f
commit 831abc884f
parent 66e0fdea28
3 changed files with 143 additions and 0 deletions
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@ -7,6 +7,7 @@ from datetime import datetime
 from openai import OpenAI
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
 from fuzzywuzzy import process
 from utils.logger import setup_logger
 from backend.SearchEpc import SearchEpc
@ -351,6 +352,9 @@ class AssetList:
        self.duplicated_addresses = None
        self.contact_details = None
        self.contact_detail_fields = None
+        self.outcomes = None
+        self.outcomes_no_match = None
+        self.master_surveyed = None

        # We detect the presence of the non-intrusive columns
        self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
@ -758,6 +762,11 @@ class AssetList:
        for v in missing_variables:
            self.standardised_asset_list[v] = None

+        # Convert to string
+        self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = (
+            self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str)
+        )
+
    def merge_data(self, df: pd.DataFrame):
        """
        Used to insert data into the standardised asset list, based on the domna property id
@ -1831,3 +1840,120 @@ class AssetList:
        )

        self.hubspot_data = programme_data
+
+    def flag_outcomes(
+        self,
+        outcomes_filepath,
+        outcomes_sheetname
+    ):
+        """
+        Match surveyor outcomes to the standardised asset list and summarise them per property.
+
+        Each row of the outcomes sheet is matched to the asset list on a normalised full address
+        (lower-cased, commas removed, one pass of double-space -> single-space). A row only counts
+        as matched when exactly one asset list entry carries that address; rows with no candidate,
+        several candidates, or a blank address are collected in ``self.outcomes_no_match``.
+
+        Sets ``self.outcomes`` (the sheet plus ``row_id``, ``matched_to_asset_list`` and the matched
+        domna property id), ``self.outcomes_no_match``, and merges one count column per distinct
+        "Outcome" value plus ``visit_count`` onto ``self.standardised_asset_list``.
+
+        :param outcomes_filepath: Path to the outcomes Excel workbook. If None, nothing is done.
+        :param outcomes_sheetname: Name of the sheet holding the outcomes. The sheet must contain
+            "Address" and "Outcome" columns.
+        """
+        if outcomes_filepath is None:
+            # Nothing to flag - previously this fell through and tried to read a None path
+            logger.info("No outcomes file provided - skipping outcome flagging")
+            return
+
+        # TODO: Parameterise for future use?
+        self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
+        self.outcomes["row_id"] = self.outcomes.index
+
+        logger.info("Matching outcomes to asset list")
+
+        # Normalise the asset list addresses once, rather than on every iteration of the loop below
+        asset_addresses = (
+            self.standardised_asset_list[self.STANDARD_FULL_ADDRESS]
+            .str.lower()
+            .str.replace(",", "", regex=False)
+            .str.replace("  ", " ", regex=False)
+        )
+        # We only accept unambiguous matches, so addresses that appear more than once in the asset
+        # list (keep=False flags every member of a duplicated group) are excluded from the lookup
+        unambiguous = (asset_addresses.notna() & ~asset_addresses.duplicated(keep=False)).values
+        address_to_property_id = dict(
+            zip(
+                asset_addresses[unambiguous],
+                self.standardised_asset_list.loc[unambiguous, self.DOMNA_PROPERTY_ID],
+            )
+        )
+
+        # Match the outcomes onto the asset list - we check we're able to match sufficiently well
+        lookup = []
+        nomatch = []
+        for row_id, address in tqdm(self.outcomes["Address"].items(), total=len(self.outcomes)):
+            # Blank cells come through as NaN (a float) and can never match an address
+            if not isinstance(address, str):
+                nomatch.append(row_id)
+                continue
+
+            address_clean = address.lower().replace(",", "").replace("  ", " ")
+            if address_clean not in address_to_property_id:
+                nomatch.append(row_id)
+                continue
+
+            lookup.append(
+                {
+                    "row_id": row_id,
+                    self.DOMNA_PROPERTY_ID: address_to_property_id[address_clean]
+                }
+            )
+
+        self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)]
+
+        if not lookup:
+            # Without any matches there is nothing to pivot or merge onto the asset list
+            logger.info("No outcomes could be matched to the asset list")
+            self.outcomes["matched_to_asset_list"] = False
+            self.outcomes[self.DOMNA_PROPERTY_ID] = None
+            return
+
+        lookup = pd.DataFrame(lookup, columns=["row_id", self.DOMNA_PROPERTY_ID])
+
+        # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times
+        # Where we have multiple rows, we want to make a call on what the action should be. For example,
+        # there may be properties that have been visited multiple times where the outcome was "See notes" implying
+        # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has
+        # happened multiple times, in this case we judge that the work may not be viable
+        visits = lookup.merge(
+            self.outcomes[["row_id", "Outcome"]], how="left", on="row_id"
+        )
+
+        visit_counts = (
+            visits.groupby(self.DOMNA_PROPERTY_ID)["row_id"]
+            .count()
+            .reset_index()
+            .rename(columns={"row_id": "visit_count"})
+        )
+
+        # One row per property, one column per distinct outcome holding the number of visits with that outcome
+        pivot_df = (
+            visits.groupby([self.DOMNA_PROPERTY_ID, "Outcome"])
+            .size()
+            .unstack(fill_value=0)
+            .reset_index()
+        )
+        pivot_df = pivot_df.merge(
+            visit_counts, how="left", on=self.DOMNA_PROPERTY_ID
+        )
+
+        # We merge the matched property id onto outcomes. Only the id is merged: the outcome columns
+        # already exist on the sheet and merging them again would create _x/_y duplicates
+        self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values)
+        self.outcomes = self.outcomes.merge(
+            lookup, how="left", on="row_id"
+        )
+
+        # We merge our pivoted outcomes onto the asset list
+        self.standardised_asset_list = self.standardised_asset_list.merge(
+            pivot_df, how="left", on=self.DOMNA_PROPERTY_ID
+        )
+
+    def flag_survey_master(
+        self,
+        master_filepaths
+    ):
+        """
+        Read the survey master CSVs and merge their survey status onto the standardised asset list.
+
+        From each master file the "UPRN", "SUBMISSION DATE" and install/cancellation column are kept
+        ("INSTALLED OR CANCELLED" when present, otherwise "INSTALL / CANCELLATION DATE"). "UPRN" is
+        renamed to the standard landlord property id and the install/cancellation column to
+        ``survey_status``. A boolean ``cancelled`` column records whether the status text contains
+        "cancel". Rows without an id, or flagged as not being on the asset list, are dropped, and
+        the result is de-duplicated on the landlord property id (first occurrence wins).
+
+        Sets ``self.master_surveyed`` and left-merges it onto ``self.standardised_asset_list``.
+
+        :param master_filepaths: Iterable of paths to master CSV files. If it is empty, nothing is done.
+        """
+        # TODO: This probably needs further expansion
+
+        logger.info("Getting masters and merging onto asset list")
+        master_surveyed = []
+        for filepath in master_filepaths:
+            # Read every column as text: a numeric id column containing blanks would otherwise be
+            # parsed as float and stringify as e.g. "123.0", which never matches the asset list ids
+            master_data = pd.read_csv(filepath, dtype=str)
+            # Strip columns
+            master_data.columns = [c.strip() for c in master_data.columns]
+
+            install_col = (
+                "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
+                else "INSTALL / CANCELLATION DATE"
+            )
+
+            # We just need to check if any were cancelled
+            master_to_append = master_data[
+                ["UPRN", install_col, "SUBMISSION DATE"]
+            ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"})
+            # na=False so a blank status counts as "not cancelled" rather than leaving NaN in a boolean column
+            master_to_append["cancelled"] = (
+                master_to_append["survey_status"].str.lower().str.contains("cancel", regex=False, na=False)
+            )
+
+            master_surveyed.append(master_to_append)
+
+        if not master_surveyed:
+            # pd.concat raises on an empty list, so there is nothing to merge in this case
+            logger.info("No master files provided - skipping master flagging")
+            return
+
+        master_surveyed = pd.concat(master_surveyed, ignore_index=True)
+        master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])]
+        master_surveyed = master_surveyed[
+            ~master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID].isin(
+                ["NOT ON ASSET LIST", "Missing From Asset List"]
+            )
+        ]
+
+        master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[
+            self.STANDARD_LANDLORD_PROPERTY_ID
+        ].astype(str)
+
+        # We de-dupe crudely on landlord property id
+        self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID])
+
+        self.standardised_asset_list = self.standardised_asset_list.merge(
+            self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
+        )
--- a/asset_list/app.py
+++ b/asset_list/app.py
@ -299,6 +299,9 @@ def app():
    landlord_wall_construction = "Wall Constuction"
    landlord_heating_system = "Heating"
    landlord_existing_pv = None
+    outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx"
+    master_filename_eco3 = "ECO 3 -Table 1.csv"
+    master_filename_eco4 = "ECO 4 -Table 1.csv"

    # Maps addresses to uprn in problematic cases
    manual_uprn_map = {}
@ -354,6 +357,18 @@ def app():

    asset_list.apply_standardiation()

+    # We now flag properties that have been treated under existing programmes
+    asset_list.flag_outcomes(
+        outcomes_filepath=os.path.join(data_folder, outcomes_filename),
+        outcomes_sheetname="Feedback"
+    )
+
+    asset_list.flag_survey_master(
+        master_filepaths=[
+            os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None
+        ],
+    )
+
    ### We retrieve the EPC data

    # We chunk up this data into 5000 rows at a time
@ -497,6 +512,7 @@ def app():
    cleaned = msgpack.unpackb(cleaned, raw=False)

    # TODO: We should break out the identification of work types to flag blocks of flats specifically
+    # TODO: Append existing outcomes onto the sheet.
    asset_list.identify_worktypes(cleaned)

    pprint(asset_list.work_type_figures)
--- a/etl/customers/stonewater/data_cleaning.py
+++ b/etl/customers/stonewater/data_cleaning.py
@ -96,6 +96,7 @@ def download_data_from_sharepoint():
        folder for folder in contents["value"] if folder["name"] in folders_to_keep
    ]
    for folder_to_pull in folders_to_pull:
+        
        # Get the contents
        folder_contents = sharepoint_client.list_folder_contents(
            drive_id=sharepoint_client.document_drive["id"],