From 2cf0fcc2b13d0d7e9178191d7719585e9bedac87 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 6 May 2025 19:59:16 +0100 Subject: [PATCH 1/6] wtf --- backend/engine/engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 408d044e..9c7b867c 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -1033,6 +1033,7 @@ async def model_engine(body: PlanTriggerRequest): # Commit final changes session.commit() + print("Hello world") except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) From 9b869063d155b9d83bce4fe423deedefa38f20e5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 6 May 2025 19:59:33 +0100 Subject: [PATCH 2/6] Jun-te was right --- backend/engine/engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 9c7b867c..408d044e 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -1033,7 +1033,6 @@ async def model_engine(body: PlanTriggerRequest): # Commit final changes session.commit() - print("Hello world") except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) From 96fb10390b6526ff577d49b2dd9010b31580709d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 7 May 2025 15:09:58 +0100 Subject: [PATCH 3/6] Working on LHP asset list review --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 271 ++++++++++++++++--------- asset_list/app.py | 105 +++++++++- asset_list/mappings/heating_systems.py | 45 +++- asset_list/mappings/outcomes.py | 231 +++++++++++++++++++++ asset_list/mappings/property_type.py | 14 +- 7 files changed, 570 insertions(+), 100 deletions(-) create mode 100644 asset_list/mappings/outcomes.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml 
b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 18e202b6..4586ae57 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -19,6 +19,7 @@ import asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings import asset_list.mappings.built_form as built_form_mappings import asset_list.mappings.roof as roof_mappings +import asset_list.mappings.outcomes as outcomes_mappings from recommendations.recommendation_utils import ( estimate_perimeter, @@ -1139,21 +1140,29 @@ class AssetList: # We add a SAP category for all work type identification self.standardised_asset_list["SAP Category"] = np.where( ( - (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68) | - (self.standardised_asset_list[self.STANDARD_SAP] <= 68) + (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 54) | + (self.standardised_asset_list[self.STANDARD_SAP] <= 54) ), - "SAP Rating 68 or less", + "SAP Rating 54 or less", np.where( ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= - self.EMPTY_CAVITY_SAP_THRESHOLD - ) | (self.standardised_asset_list[self.STANDARD_SAP] <= self.EMPTY_CAVITY_SAP_THRESHOLD) + (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68) | + (self.standardised_asset_list[self.STANDARD_SAP] <= 68) + ), + "SAP Rating 55-68", + np.where( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.EMPTY_CAVITY_SAP_THRESHOLD + ) | (self.standardised_asset_list[self.STANDARD_SAP] <= self.EMPTY_CAVITY_SAP_THRESHOLD) + ), + f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}", + f"SAP Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more" ), - f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}", - f"SAP 
Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more" ) ) + else: # We add a SAP category for all work type identification # We break into 4 categories (54 or less, 55-68, 69-74, 75 or more) @@ -1724,8 +1733,8 @@ class AssetList: ~self.standardised_asset_list["epc_indicates_empty_cavity"] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), - "Landlord Data Shows Empty Cavity, EPC & Inspections Shows Filled: " + self.standardised_asset_list[ - "SAP Category"], + "Landlord Data Shows Empty Cavity, EPC & Inspections Shows Filled or Non-cavity: " + + self.standardised_asset_list["SAP Category"], self.standardised_asset_list["cavity_reason"] ) @@ -2172,10 +2181,7 @@ class AssetList: # TODO: Fetch from Sharepoint ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/15.04.csv" logger.info("Getting Ecosurv data from %s", ecosurv_filepath) - self.ecosurv = pd.read_csv( - ecosurv_filepath, - encoding="cp437" - ) + self.ecosurv = pd.read_csv(ecosurv_filepath, encoding="cp437") landlords = self.ecosurv["Landlord"].value_counts().reset_index(drop=False) landlord_references = landlords[ @@ -2260,46 +2266,82 @@ class AssetList: def flag_outcomes( self, - outcomes_filepath, + outcomes_filepaths, outcomes_sheetname, outcomes_address, outcomes_postcode, outcomes_houseno, outcomes_id ): - if outcomes_filepath is None: + if not outcomes_filepaths: return - self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) - self.outcomes["row_id"] = self.outcomes.index - - if outcomes_houseno is None: - outcomes_houseno = "houseno" - self.outcomes["houseno"] = self.outcomes[outcomes_address].apply( - lambda x: SearchEpc.get_house_number(x, self.outcomes[outcomes_postcode]) - ) - - logger.info("Matching outcomes to asset list") - # Merge the outcomes onto the asset list - we check we're able to match sufficiently well + self.outcomes = [] + outcomes_no_match = [] lookup = [] - nomatch = [] - for _, x in tqdm(self.outcomes.iterrows(), 
total=len(self.outcomes)): + for idx, outcomes_filepath in enumerate(outcomes_filepaths): + outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname[idx]) + outcomes["row_id"] = outcomes.index - if pd.isnull(x[outcomes_address]): - continue + if outcomes_houseno[idx] is None: + outcomes_houseno = "houseno" + outcomes["houseno"] = outcomes[outcomes_address[idx]].apply( + lambda x: SearchEpc.get_house_number(x, outcomes[outcomes_postcode]) + ) - # Check if we have an id - oid = x[outcomes_id] if outcomes_id is not None else None + # We handle an edge case that occured for LHP + if "Notes / Outcomes" in outcomes.columns and "Outcome" not in outcomes.columns: + # We use the re-mapper to handle this: + outcomes["Notes / Outcomes"] = outcomes["Notes / Outcomes"].str.strip() + values_to_remap = outcomes["Notes / Outcomes"].unique() + # We want to map this to our standardised list of property types we're interested in + remapper = DataRemapper( + standard_values=outcomes_mappings.outcomes_values, standard_map=outcomes_mappings.outcomes_map + ) + remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist()) + # Perform the remap + outcomes["Outcome"] = outcomes["Notes / Outcomes"].map(remap_dictionary) + + outcomes["Outcome"] = outcomes["Outcome"].str.lower() + + logger.info("Matching outcomes to asset list") + # Merge the outcomes onto the asset list - we check we're able to match sufficiently well + lookup_i = [] + nomatch_i = [] + for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)): + + if pd.isnull(x[outcomes_address[idx]]): + continue + + # Check if we have an id + oid = x[outcomes_id[idx]] if outcomes_id[idx] is not None else None + + if oid is not None: + matched = self.standardised_asset_list[ + (self.standardised_asset_list[ + self.STANDARD_LANDLORD_PROPERTY_ID + ].str.strip() == oid) + ] + + if matched.shape[0] == 1: + lookup_i.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: 
matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + + address_clean = x[outcomes_address[idx]].lower().replace(",", "").replace(" ", " ") - if oid is not None: matched = self.standardised_asset_list[ (self.standardised_asset_list[ - self.STANDARD_LANDLORD_PROPERTY_ID - ].str.strip() == oid) + self.STANDARD_FULL_ADDRESS + ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean) ] if matched.shape[0] == 1: - lookup.append( + lookup_i.append( { "row_id": x["row_id"], self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] @@ -2307,65 +2349,65 @@ class AssetList: ) continue - address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ") - - self.outcomes["Outcome"] = self.outcomes["Outcome"].str.lower() - - matched = self.standardised_asset_list[ - (self.standardised_asset_list[ - self.STANDARD_FULL_ADDRESS - ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean) - ] - - if matched.shape[0] == 1: - lookup.append( - { - "row_id": x["row_id"], - self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] - } - ) - continue - - matched = self.standardised_asset_list[ - (self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == x[outcomes_postcode]) - ].copy() - if not matched.empty: - matched["houseno"] = matched.apply( - lambda x: SearchEpc.get_house_number( - str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE]) - ), - axis=1 - ) - - matched = matched[ - matched["houseno"].astype(str) == str(x[outcomes_houseno]) - ] - if matched.shape[0] == 1: - lookup.append( - { - "row_id": x["row_id"], - self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] - } + matched = self.standardised_asset_list[ + (self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == x[outcomes_postcode[idx]]) + ].copy() + if not matched.empty: + matched["houseno"] = matched.apply( + lambda x: SearchEpc.get_house_number( + str(x[self.STANDARD_ADDRESS_1]), 
str(x[self.STANDARD_POSTCODE]) + ), + axis=1 ) - continue - elif not matched.empty: - # Use levenstein distance to match - matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE] - best_match = process.extractOne(x["Address"], matched[self.STANDARD_FULL_ADDRESS].values)[0] - matched = matched[matched[self.STANDARD_FULL_ADDRESS] == best_match] - lookup.append( - { - "row_id": x["row_id"], - self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] - } - ) - continue + if pd.isnull(x[outcomes_houseno[idx]]): + house_no_to_match = SearchEpc.get_house_number( + str(x[outcomes_address[idx]]), str(x[outcomes_postcode[idx]]) + ) + if isinstance(house_no_to_match, str): + house_no_to_match = house_no_to_match.lower() + else: + house_no_to_match = str(x[outcomes_houseno[idx]]).strip() - nomatch.append(x["row_id"]) + matched = matched[matched["houseno"].astype(str) == house_no_to_match] + if matched.shape[0] == 1: + lookup_i.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + elif not matched.empty: + # Use levenstein distance to match + matched["address"] = ( + matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE] + ) - self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)] - lookup = pd.DataFrame(lookup) + best_match = process.extractOne( + x[outcomes_address[idx]], matched[self.STANDARD_FULL_ADDRESS].values + )[0] + matched = matched[matched[self.STANDARD_FULL_ADDRESS] == best_match] + lookup_i.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + + nomatch_i.append(x["row_id"]) + + outcomes_no_match_i = outcomes[outcomes["row_id"].isin(nomatch_i)] + lookup_i = pd.DataFrame(lookup_i) + + outcomes_no_match.append(outcomes_no_match_i) + lookup.append(lookup_i) + self.outcomes.append(outcomes) + + lookup = pd.concat(lookup) + outcomes_no_match = 
pd.concat(outcomes_no_match) + self.outcomes = pd.concat(self.outcomes) if lookup.empty: return @@ -2376,10 +2418,19 @@ class AssetList: # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has # happened multiple times, in this case we judge that the work may not be viable - date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date" + if "Week Commencing" in self.outcomes.columns: + date_col = "Week Commencing" + elif "Survey Date" in self.outcomes.columns: + date_col = "Survey Date" + elif "Date letters sent" in self.outcomes.columns: + date_col = "Date letters sent" + else: + raise NotImplementedError("Invalid date in outcomes - implement me") + + notes_col = "Notes" if "Notes" in outcomes.columns else "Notes / Outcomes" lookup = lookup.merge( - self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id" + self.outcomes[["row_id", "Outcome", notes_col, date_col]], how="left", on="row_id" ) visit_counts = ( @@ -2390,11 +2441,33 @@ class AssetList: .sort_values("visit_count", ascending=False) ) + def extract_date(s): + if isinstance(s, str): + match = re.search(r"(\d{2}\.\d{2}\.\d{4})", s) + if match: + return pd.to_datetime(match.group(1), format="%d.%m.%Y", errors="coerce") + return pd.NaT + + lookup['parsed_date'] = lookup['Date letters sent'].apply(extract_date) + + def get_latest_note(group): + surveyed = group[group['Outcome'] == 'surveyed'] + if not surveyed.empty: + return surveyed.sort_values('parsed_date', ascending=False).iloc[0] + else: + return group.sort_values('parsed_date', ascending=False).iloc[0] + + latest_note = lookup.groupby('domna_property_id', group_keys=False).apply(get_latest_note).reset_index( + drop=True) + latest_note = latest_note[["domna_property_id", notes_col]] + pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index() pivot_df = pivot_df.merge( visit_counts, how="left", on="domna_property_id" ) + # 
We want the latest note + if pivot_df[self.DOMNA_PROPERTY_ID].duplicated().sum(): raise Exception("We have duplicated property IDs in the outcomes data") @@ -2406,6 +2479,14 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list.merge( pivot_df, how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id" ) + # Merge the latest note + self.standardised_asset_list = self.standardised_asset_list.merge( + latest_note.rename(columns={notes_col: "Latest Route March Note"}), + how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id" + ) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError("Duplicates appreared - something went wrong") self.outcomes = self.outcomes.sort_values("domna_property_id", ascending=False) diff --git a/asset_list/app.py b/asset_list/app.py index 37e687fc..14322a97 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -89,6 +89,103 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # LHP: + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP" + data_filename = "LHP.xlsx" + sheet_name = "Decent Homes Stock" + postcode_column = 'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Date" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = "Heating Type" + landlord_existing_pv = None + landlord_property_id = "Property ID" + landlord_sap = None + outcomes_filename = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP/LHP Outcomes.xlsx", + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP/Lincolnshire Housing Partnership - Outcomes 20th " + "Feb 2024.xlsx", + ] + outcomes_sheetname = ["Sheet1", "LHP"] + outcomes_postcode = ["Postcode", "Postcode"] + outcomes_houseno = ["No.", "No."] + outcomes_id = [None, None] + outcomes_address = ["Address", "Address"] + master_filepaths = [os.path.join(data_folder, "LHP Rolling Master for analysis.csv")] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = "lhp" + + # Soverign + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Sovereign" + data_filename = "Warmfront - Quote for CWI.xlsx" + sheet_name = "Sheet2" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "Address Line 1" + address1_method = None + address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "ID" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + + # NCHA + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA" + data_filename = "Energy Info Copy.xlsx" + sheet_name = "Data" + postcode_column = 'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Date (HAR10)" + landlord_os_uprn = None + landlord_property_type = "Property Type (HAR10)" + landlord_built_form = "Build Form (EPC)" + 
landlord_wall_construction = "Wall Description" + landlord_roof_construction = None + landlord_heating_system = "Heating System" + landlord_existing_pv = None + landlord_property_id = "Place ref" + landlord_sap = "EPC SAP" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + # Torus data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1" data_filename = "Torus Property Asset List - Phase 1.xlsx" @@ -482,7 +579,7 @@ def app(): # We now flag properties that have been treated under existing programmes asset_list.flag_outcomes( - outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None, + outcomes_filepaths=outcomes_filename, outcomes_sheetname=outcomes_sheetname, outcomes_address=outcomes_address, outcomes_postcode=outcomes_postcode, @@ -611,6 +708,12 @@ def app(): transformed_data.append(row_data) transformed_df = pd.DataFrame(transformed_data) + for col in [ + "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ]: + if col not in transformed_df.columns: + transformed_df[col] = False transformed_df = transformed_df[ [ asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index b5cf500f..e255ba4d 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -220,5 +220,48 @@ HEATING_MAPPINGS = { 'Boiler/ underfloor': 'electric underfloor', 'Storage system': "non-electric underfloor", 'BOILER': 'gas combi boiler', - 'SPACE_HEATER': 'room heaters' + 'SPACE_HEATER': 'room heaters', + 'AIR': 'air source heat pump', + 'FSOL': 'solid fuel', + 'PDEV': 'unknown', + 'GASF': 'gas boiler, radiators', + 'CONO': 'no heating', + 
'FELE HRSH': 'high heat retention storage heaters', + 'FOIL': 'oil boiler', + 'FDEV': 'unknown', + 'FNON': 'non-electric underfloor', + 'FGAS': 'gas combi boiler', + 'FELE': 'electric fuel', + 'GRNE': 'ground source heat pump', + + 'High Heat Storage Heaters': 'high heat retention storage heaters', + 'Electric Radiators': 'electric radiators', + 'Electric Air Source Heat Pump': 'air source heat pump', + 'Gas Combi Condensing Boiler': 'gas condensing combi', + 'Electric Boiler Heating': 'electric boiler', + 'Solid Fuel Open Back Boiler Heating': 'solid fuel', + 'Solid Fuel Closed Back Boiler Heating': 'solid fuel', + 'Oil Boiler': 'oil boiler', + 'Electric Storage Heaters': 'electric storage heaters', + 'Gas Combi Boiler Heating': 'gas combi boiler', + 'Electric NIBE Heating System': 'air source heat pump', + 'Gas Back Boiler': 'gas boiler, radiators', + 'Electric Gel/Oil Filled Radiators': 'electric radiators', + 'No Information': 'unknown', + 'Oil Combination Boiler Heating': 'oil boiler', + 'Electric DSR Heat Retention Radiators': 'high heat retention storage heaters', + 'Communal Heating System': 'communal heating', + 'Description': 'unknown', + 'Oil Combi Condensing Boiler Heating': 'oil boiler', + 'Gas Combi Condensing Boiler Heating': 'gas condensing combi', + 'Electric Warm Air Heating': 'electric fuel', + 'Gas System Boiler Heating': 'gas boiler, radiators', + 'Gas Back Boiler Heating': 'gas boiler, radiators', + 'Electric Gel/Oil Fllled Radiators': 'electric radiators', + 'Gas Condensing Boiler Heating': 'gas condensing combi', + 'Gas Combi Condensing Boiler Heatiner': 'gas condensing combi', + 'Oil Standard Boiler Heating': 'oil boiler', + 'Oil Condensing Boiler Heating': 'oil boiler', + 'Electric ASHP': 'air source heat pump', + 'Modern Slimline Storage Heaters': 'electric storage heaters' } diff --git a/asset_list/mappings/outcomes.py b/asset_list/mappings/outcomes.py new file mode 100644 index 00000000..c376267f --- /dev/null +++ 
b/asset_list/mappings/outcomes.py @@ -0,0 +1,231 @@ +""" +This script was produced to handle the non-standard outcomes, observed in the LHP outcomes sheet +""" +import numpy as np + +outcomes_values = [ + "Access Issues", "No Outcome", "Asked for a later date", "Customer Refusal", + "Installer Refusal", "No Answer", "Not Viable", "Surveyed", + "Rescheduled", "Not Knocked", "Void" +] + +outcomes_map = { + 'Access issues, shed against rear wall. Sent photos to Matt JJC, declined': 'Access Issues', + 'NO ANSWER /TICKET LEFT': 'No Answer', + 'Looks Void - No Answer': 'No Answer', + 'No Answer - they were in - No response to my drop card': 'No Answer', 'No Answer': 'No Answer', + 'No Answer - Even they were in - No response to my drop card': 'No Answer', 'no answer': 'No Answer', + 'NO ANSWER': 'No Answer', 'No answer': 'No Answer', + np.nan: 'unknown', + 'Access Issues Health reasons try another time': 'Access Issues', + 'LOFT FULL, CUSTOMER WONT REMOVE': 'Access Issues', + 'Failed Appointment - Ivy': 'Access Issues', + 'Failed Appointment - Void soon': 'Void', + 'Hoarding in loft': 'Access Issues', + 'Non Complained - Extension at rear and side': 'Not Viable', + 'Said No letter - then texted me I can only do outside but cant come in': 'Customer Refusal', + 'Hoarding - unwilling to shift from loft': 'Customer Refusal', + 'Overgrown vegatation - Happy for HA to deal with': 'Access Issues', + 'No access to side of property': 'Not Viable', + 'Very rude': 'Customer Refusal', + 'REFUSED ACCESS': 'Customer Refusal', + 'SURVEYED': 'Surveyed', + 'ELECTRIC ROOM HEATERS. 
Kieran to check re funding and possible PV?': 'Not Viable', + 'SUBMITTED': 'Surveyed', + '2 single storey extensions': 'Not Viable', + 'Rebook': 'Rescheduled', + 'surveyed': 'Surveyed', + 'not intrested': 'Customer Refusal', + 'Fixed seating area against rear elevation': 'Not Viable', + "Matt said can't install": 'Installer Refusal', + 'Gave excuses to come this and that time and no reponse': 'No Answer', + 'NOT KNOCKED': 'Not Knocked', + 'VOID PROPERTY': 'Void', + 'Glass lean to. JJC declined': 'Installer Refusal', + 'Left slip Overgrown vegatation': 'No Answer', + 'covid': 'Rescheduled', + 'Lean-to on side elevation': 'Not Viable', + 'Opted out as moving out': 'Customer Refusal', + 'Surveyed': 'Surveyed', + 'refused': 'Customer Refusal', + 'COVID': 'Rescheduled', + 'Said No letter received and didn’t answer again': 'No Answer', + 'Survey completed': 'Surveyed', + 'Loft fully boarded': 'Access Issues', + 'Not Available during the day': 'No Answer', + 'Conservatory. JJC declined.': 'Installer Refusal', + 'Booked for 19.10.23': 'Rescheduled', + 'LETTER LEFT': 'No Answer', + 'Knocked/lettered': 'No Answer', + 'Survey Complete': 'Surveyed', + 'Refused by calling office': 'Customer Refusal', + 'Extension on rear elevation': 'No Viable', + 'Left Slip - Potential access issue with conservatory': 'Access Issues', + 'Overgrown vegatation': 'Access Issues', + 'Left slip Overgrown Ivy and Hedge': 'No Answer', + 'NOT AVAILABLE THIS WEEK': 'No Answer', + 'Unwilling to clear loft': 'Access Issues', + 'survey complete': 'Surveyed', + 'ivy on wall': 'Access Issues', + 'not in': 'No Answer', + 'Covid shrub very close to building': 'Rescheduled', + 'ON HOLIDAY, UNDER 18 IN HOUSE': 'Rescheduled', + 'wont do as extention': 'Not Viable', + 'IN, WONT ANSWER': 'Customer Refusal', + 'Too many plants next to the walls': 'Access Issues', + 'obstructions': 'Access Issues', + 'Left slip -Wall plant': 'Access Issues', + 'On holiday': 'No Answer', + 'Failed appointment': 'No Answer', + 'LOFT 
FULLY BOARDED': 'Access Issues', + 'ivy and didn’t want people inside the house': 'Customer Refusal', + 'Partly IWI': 'Not Viable', + 'Covid': 'Rescheduled', + 'REFUSE TO REMOVE IVY': 'Access Issues', + 'Insulated 2 years ago. Carbon bead in walls, 300mm rock wool in loft': 'Not Viable', + 'INCONVIENIENT TIME': 'No Answer', + 'EXT TO REAR': 'Not Viable', + 'Not In': 'No Answer', + 'Damp issues.Black mould on walls': 'Access Issues', + 'Lean to. JJC declined': 'Installer Refusal', + 'DISABLED CHILD / INCONVIENIENT': 'Customer Refusal', + 'Plants on wall': 'Access Issues', + 'Left Slip': 'No Answer', + 'Never answered': 'No Answer', + 'SOLAR PV CONNECTED TO MAINS': 'Not Viable', + 'Bungalow': 'unknown', + 'call back': 'No Answer', + 'Message from WFT OFFICE; tenant unavailable this week, no telephone number provided': 'Rescheduled', + 'LEAN TO PRESENT': 'Not Viable', + 'She said come Tuesday and never answered': 'Rescheduled', + 'Sold': 'Surveyed', + 'Too much mould and cluttered house': 'Access Issues', + 'Overgrown vegatation will call when clear': 'Access Issues', + 'LOFT DEC 2013': 'Not Viable', + 'Ivy': 'Access Issues', + 'Booked for next week': 'Rescheduled', + 'empty': 'Void', + 'Been told property is empty as tenant has passed away': 'Void', + 'Non Complianced - Single Storey Extension to the front and rear': 'Not Viable', + 'Going back this week': 'Rescheduled', + 'Loft insulated in last few months. Ongoing damp issues in bathroom, black mould up wall': 'Access Issues', + 'rear Extension': 'Not Viable', + 'DECKING AROUND PROPERTY IN BREACH OF DPC BY 300MM': 'Not Viable', + 'Said no letter received': 'Customer Refusal', + 'Unwell, not convenient this week': 'Rescheduled', + 'IVY on Wall': 'Access Issues', + 'REFUSED EXTRACTOR': 'Customer Refusal', + 'ON HOLIDAY': 'Rescheduled', + 'COVID. Not this week.': 'Rescheduled', + 'COVID POSITIVE': 'Rescheduled', + 'VOID. 
Appears to be under refurbishment': 'Void', + 'Survey Completed': 'Surveyed', + 'INCONVIENIENT': 'Rescheduled', + 'Knocked/lettered. 07598 112360': 'No Answer', + 'Single skin lean to. JJC declined': 'Installer Refusal', + 'DENIES LETER, REFUSED ACCESS': 'Customer Refusal', + 'Loft hoard unable to clear': 'Access Issues', + 'Left Slip - Look Void': 'Void', + 'EXCESSIVE IVY GROWTH, CUSOMER UNABLE TO REMOVE, ELDERLEY': 'Access Issues', + 'Refused': 'Customer Refusal', + 'REFUSED / INCONVENIENT': 'Customer Refusal', + 'AGGRESSIVE DOGS LOOSE IN FRONT GARDEN': 'Access Issues', + 'EXCESSIVE IVY': 'Access Issues', + "Won't remove plastic roof": 'Access Issues', + 'SURVEY COMPLETED': 'Surveyed', + 'VOID. Under refurbishment. Electric storage heating currently removed for refurbishment': 'Void', + 'Surveyed ECO4': 'Surveyed', + 'after 5.30': 'Rescheduled', + 'CUSTOMER IN, WONT ANSWER DOOR': 'No Answer', + 'IVY': 'Access Issues', + 'Single storey extension on gable': 'Not Viable', + 'No answer.': 'No Answer', + 'Full extension at rear. Not viable.': 'Not Viable', + 'Access issues': 'Access Issues', + 'VOID PROPERTY NOW': 'Void', + 'Not viable': 'Not Viable', + 'Looks like a VOID property': 'Void', + 'NOT VIABLE': 'Not Viable', + 'No Answer.': 'No Answer', + 'Not viable.': 'Not Viable', + 'Looks to be void.': 'Void', + 'Access issues and loft fully boarded/full': 'Access Issues', + 'Extension on property. Not Viable': 'Not Viable', + 'No good. 
Serious Access issues.': 'Access Issues', + 'Surveyed and Submitted': 'Surveyed', + 'UNSANITARY CONDITIONS, RUBBISH EVERYWHERE': 'Access Issues', + 'Will call when rubbish removed.': 'Access Issues', + 'Covered in Ivy': 'Access Issues', + 'CUSTOMER REFUSED': 'Customer Refusal', + 'Still covered in ivy': 'Access Issues', + 'CUSTOMER SHOUTED OUT OF WINDOW TO COME BACK ANOTHER TIME': 'Customer Refusal', + "Extension on property, can't be done.": 'Not Viable', + 'Will be looking to do Survey WC 19.02': 'Rescheduled', + "Tenant was working, couldn't do survey.": 'No Answer', + 'PROPERTY EMPTY, SPOKE TO EX TENNANT WHO LEFT 3 WEEKS AGO?': 'Void', + 'Will call back.': 'Rescheduled', + "Tenant not interested. Won't empty loft.": 'Customer Refusal', + "Won't answer door.": 'Customer Refusal', + "Tenant 'Doesn't want anything to do with LHP'": 'Customer Refusal', + "Loft full. Tenant won't empty.": 'Access Issues', + 'Covered in foliage': 'Access Issues', + 'Customer not home for appointment.': 'No Answer', + 'Blown in bead': 'Not Viable', + 'Distance to property to far from road.': 'Access Issues', + 'LOFT FULL, CUSTOMER UNABLE TO CLEAR': 'Access Issues', + 'Stuff against rear wall. Will call when removed.': 'Access Issues', + 'Will call when rubbish is removed': 'Access Issues', + 'Mid Terrace': 'unknown', + 'Tile Hung areas.': 'Not Viable', + 'REFUSED / UNABLE TO CLEAR LOFT': 'Customer Refusal', + 'Calling back on Monday (19.02)': 'Rescheduled', + 'Solid Wall': 'Not Viable', + 'FAULTY PHONE NUMBER, 3 X KNOCK, LETTER LEFT ON FIRST ATTEMPT, NO REPLY OR CALL BACK': 'No Answer', + 'Not interested': 'Customer Refusal', + 'ACCESS DENIED': 'Customer Refusal', + 'Covered in Ivy.': 'Access Issues', + 'UNABLE TO GENERATE SAP GAIN WITH EXTENSIONS FRONT AND REAR': 'Not Viable', + 'Extension on the property.': 'Not Viable', + "Covered in Ivy. 
Can't remove it.": 'Access Issues', + 'Booked in, but not in when called back': 'No Answer', + 'EXCESSIVE IVY ON WALLS (SEE PICS)': 'Access Issues', + 'Moved out': 'Void', + 'Buying the property. Not interested.': 'Customer Refusal', + 'Not been to yet': 'No Answer', + 'CUSTOMER STATES LOFT WAS INSULATED A FEW MONTHS AGO BY LHP': 'Customer Refusal', + 'Will try again.': 'No Answer', + 'HOUSE MARTINS NESTING IN EAVES OF 3 ADJOINING PROPERTIES': 'Access Issues', + 'Told me to call back': 'Rescheduled', + 'CUSTOMER SAYS PROPERTY ALREADY REFUSED AT PREVIOUS SURVEY, NO REASON GIVEN': 'Customer Refusal', + "Won't answer the door.": 'Customer Refusal', + 'Tenant not interested.': 'Customer Refusal', + 'Keep trying, keeps putting me off.': 'Customer Refusal', + 'Already insulated.': 'Not Viable', + 'Works all day.': 'No Answer', + 'PROPERTY COVER IN FOILAGE AND SHRUBS': 'Access Issues', + 'ACCESS IVY GROWTH, LEAN TO / CONSERVATORY IN WAY OF REAR': 'Not Viable', + "Tenant unwell. Doesn't want survey.": 'No Answer', + 'Wont empty loft.': 'Access Issues', + 'LOFT FULLY BOARDED AS PREVIOUSLY DISCUSSED WITH CUSTOMER BY PREVIOUS SURVEYOR': 'Access Issues', + "Property can't be done.": 'Not Viable', + 'Works everyday. Will call.': 'No Answer', + 'A LOT OF FOLIAGE IN WAY, PROPERTY LOOKS EMPTY FROM OUTSIDE?': 'Void', + "Very old tenant. Said they didn't want it.": 'Customer Refusal', + 'Covered in ivy. Unable to remove.': 'Access Issues', + 'Climbers on walls': 'Access Issues', + 'Will not remove foliage': 'Access Issues', + 'Not Interested.': 'Customer Refusal', + 'OFF GAS': 'unknown', + 'Tenant not interested': 'Customer Refusal', + 'Will call me. 
Left my number.': 'Rescheduled', + 'Keep trying but keeps putting me off': 'Customer Refusal', + 'Moving out.': 'Void', + 'Booked in': 'Recheduled', + 'Refused Survey': 'Customr Refusal', + 'Big dogs running around front garden.': 'Access Issues', + 'CUSTOMER HAS CLADDED WALL AT REAR IN CONSERVATORY, REFUSED INTERNAL DRILL': 'Customer Refusal', + 'Booked in.': 'Rescheduled', + 'WRONG ADDRESS?': 'unknown', + 'Works everyday. Will call me.': 'No Answer', + 'Will not remove foliage.': 'Access Issues' +} diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index f01ab5eb..225d1a1f 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -194,5 +194,17 @@ PROPERTY_MAPPING = { 'Maisonette 2 Ext. Wall': 'maisonette', '5 Ext. Wall Flat': 'flat', 'Bungalow Semi Detached': 'bungalow', - 'COMINT': 'unknown' + 'COMINT': 'unknown', + '12 SBEDSIT': 'bedsit', + '01 HOUSE': 'house', + '05 BEDSIT': 'bedsit', + '14 SFLAT': 'flat', + '09 PBEDSIT': 'bedsit', + '10 PBUNGALOW': 'bungalow', + '13 SBUNGALOW': 'bungalow', + '11 PFLAT': 'flat', + '02 FLAT': 'flat', + '04 MAISONETTE': 'maisonette', + '01 HOUSE MID': 'house', + '03 BUNGALOW': 'bungalow' } From 5848cb5314a991bf5cb2a48b0e0d2c915dc01446 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 7 May 2025 17:43:14 +0100 Subject: [PATCH 4/6] working on unitas data standardisation --- asset_list/AssetList.py | 77 +++++++++++++++++--------- asset_list/app.py | 62 ++++++++++++--------- asset_list/mappings/built_form.py | 40 ++++++++++++- asset_list/mappings/heating_systems.py | 15 ++++- asset_list/mappings/property_type.py | 15 ++++- asset_list/mappings/roof.py | 16 +++++- 6 files changed, 168 insertions(+), 57 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4586ae57..dcc7290a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -5,6 +5,7 @@ import tiktoken from pprint import pprint from datetime import 
datetime +from numpy.ma.core import masked_not_equal from openai import OpenAI import numpy as np import pandas as pd @@ -2179,7 +2180,7 @@ class AssetList: return # TODO: Fetch from Sharepoint - ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/15.04.csv" + ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/07.05.2025.csv" logger.info("Getting Ecosurv data from %s", ecosurv_filepath) self.ecosurv = pd.read_csv(ecosurv_filepath, encoding="cp437") @@ -2310,7 +2311,7 @@ class AssetList: nomatch_i = [] for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)): - if pd.isnull(x[outcomes_address[idx]]): + if pd.isnull(x[outcomes_address[idx]]) or not x[outcomes_address[idx]]: continue # Check if we have an id @@ -2448,7 +2449,7 @@ class AssetList: return pd.to_datetime(match.group(1), format="%d.%m.%Y", errors="coerce") return pd.NaT - lookup['parsed_date'] = lookup['Date letters sent'].apply(extract_date) + lookup['parsed_date'] = lookup[date_col].apply(extract_date) def get_latest_note(group): surveyed = group[group['Outcome'] == 'surveyed'] @@ -2457,8 +2458,11 @@ class AssetList: else: return group.sort_values('parsed_date', ascending=False).iloc[0] - latest_note = lookup.groupby('domna_property_id', group_keys=False).apply(get_latest_note).reset_index( - drop=True) + latest_note = ( + lookup.groupby('domna_property_id', group_keys=False). + apply(get_latest_note). 
+ reset_index(drop=True) + ) latest_note = latest_note[["domna_property_id", notes_col]] pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index() @@ -2513,36 +2517,43 @@ class AssetList: # Strip columns master_data.columns = [c.strip() for c in master_data.columns] master_data.columns = [re.sub(r'\s+', ' ', c) for c in master_data.columns] + # Drop any unnamed columns + unnamed_columns = [c for c in master_data.columns if "Unnamed:" in c] + master_data = master_data.drop(columns=unnamed_columns) if not id_map.empty: master_data = master_data.merge( id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code'] ) - install_col = ( - "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns - else "INSTALL / CANCELLATION DATE" - ) + if "INSTALLED OR CANCELLED" in master_data.columns: + install_col = "INSTALLED OR CANCELLED" + elif "INSTALL / CANCELLATION DATE" in master_data.columns: + install_col = "INSTALL / CANCELLATION DATE" + elif 'INSTALL/ CANCELLATION DATE' in master_data.columns: + install_col = 'INSTALL/ CANCELLATION DATE' + else: + raise ValueError("No install or cancellation date") submission_col = ( "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS" ) - if "UPRN" in master_data.columns: - # We just need to check if any were cancelled - master_to_append = master_data[ - ["UPRN", install_col, submission_col] - ].rename( - columns={ - "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, - install_col: "survey_status", - submission_col: "submission_date" - } - ) - master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") - - master_surveyed.append(master_to_append) - continue + # if "UPRN" in master_data.columns: + # # We just need to check if any were cancelled + # master_to_append = master_data[ + # ["UPRN", install_col, submission_col] + # ].rename( + # columns={ + # "UPRN": 
self.STANDARD_LANDLORD_PROPERTY_ID, + # install_col: "survey_status", + # submission_col: "submission_date" + # } + # ) + # master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") + # + # master_surveyed.append(master_to_append) + # continue master_data["row_id"] = master_data.index @@ -2557,23 +2568,35 @@ class AssetList: house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO" # Otherwise, we need to match algorithmically + has_property_id = "UPRN" in master_data.columns logger.info("Matching master data to asset list") matched = [] unmatched = [] for _, row in tqdm(master_data.iterrows(), total=len(master_data)): + if pd.isnull(row[postcode_col]): continue + + # if has_property_id: + # submission_uprn = row["UPRN"] + # + # if not pd.isnull(submission_uprn): + # df = self.standardised_asset_list[ + # self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == submission_uprn + # ] + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() df = self.standardised_asset_list[ ( - self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ", - "") - == postcode_no_space + self.standardised_asset_list[self.STANDARD_POSTCODE] + .str.strip().str.lower().str.replace(" ", "") == postcode_no_space ) ] house_no = row[house_no_col] + if isinstance(house_no, float): + house_no = str(int(house_no)) if house_no in df["house_no"].values: df = df[df["house_no"] == house_no] diff --git a/asset_list/app.py b/asset_list/app.py index 14322a97..76e09295 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -62,32 +62,42 @@ def app(): Property UPRN """ - # TODO: - # For cavity work: - # - Flag any entries that have a different wall type between non-intrusive data against EPC - # - Worth double checking entries that have a difference in wall construction - # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity - # - Look at the current 
EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation - # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats - # are less than C75 - # - Flag anything pre SAP2012 - # - Flag anything over 5 years old - # - Look at year built vs age band - # - # For Solar: - # - Discount any that have solar PV - based on non-intrusives and from the inspections team - # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with - # electric room heaters but it might need to be an EPC E - # - Fabric - check the floor, wall and roof: - # - Filled or empty cavity is good - # - Insulated solid/timber/system built is good - # - SCIS/CEG needs solid floors - # - JJC don’t care - # - Anything with a loft 200 or below - # - Anything C75 and above won’t qualify - # - Insulated loft = 200mm - # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) - # - Or the insulation required is loft/cavity (floors should be solid) + # Unitas + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas" + data_filename = "UNITAS - Asset List.xlsx" + sheet_name = "Asset List" + postcode_column = 'Post Code' + fulladdress_column = "Address Line 1" + address1_column = "Address Line 1" + address1_method = None + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "built year" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = "Expanded Property Type" + landlord_wall_construction = None + landlord_roof_construction = "loft insulation" + landlord_heating_system = "Bolier Make" + landlord_existing_pv = None + landlord_property_id = "Property Reference" + landlord_sap = "Sap Rating" + outcomes_filename = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/Unitas - All outcomes - 24.04.2025.xlsx", + ] + outcomes_sheetname = 
["Feedback"] + outcomes_postcode = ["Postcode"] + outcomes_houseno = ["No."] + outcomes_id = [None] + outcomes_address = ["Address"] + master_filepaths = [ + os.path.join(data_folder, "Submissions ECO 3.csv"), + os.path.join(data_folder, "Submissions ECO 4 - PHASE 1.csv"), + os.path.join(data_folder, "Submissions ECO 4 - PHASE 2.csv") + ] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = "unitas|everill|baskeyfield" # LHP: data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP" diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 1d0aecf5..6ce31cdd 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -212,5 +212,43 @@ BUILT_FORM_MAPPINGS = { '5 Ext. Wall Flat': 'unknown', 'Unknown': 'unknown', 'Enclosed mid-terrace': 'mid-terrace', - 'Enclosed end-terrace': 'end-terrace' + 'Enclosed end-terrace': 'end-terrace', + + 'House GROUND FLOOR': 'ground floor', + 'Flat? GROUND FLOOR': 'ground floor', + 'House SD SEMI DETACHED': 'semi-detached', + 'House SEMI DETACHED': 'semi-detached', + 'Flat GROUND FLOOR': 'ground floor', + '': 'unknown', + 'Flat SEVENTH FLOOR': 'mid-floor', + 'House D': 'detached', + 'House ET': 'end-terrace', + 'House SD Homeless Unit': 'semi-detached', + 'House MT Homeless Unit': 'mid-terrace', + 'Bungalow ET': 'end-terrace', + 'Bungalow D': 'detached', + 'House SD': 'semi-detached', + 'Bungalow Sheltered Accomodation': 'unknown', + 'House. 
SD': 'semi-detached', + 'Flat FIRST FLOOR MAISONETTE': 'ground floor', + 'Bungalow SD': 'semi-detached', + 'Flat FIRST FLOOR': 'ground floor', + 'Flat Sheltered Accomodation': 'unknown', + 'Flat SIXTH FLOOR': 'mid-floor', + 'Flat EIGHTH FLOOR': 'mid-floor', + 'Flat FOURTH FLOOR': 'mid-floor', + 'Flat Homeless Unit': 'unknown', + 'Bungalow MT': 'mid-terrace', + 'Bungalow Homeless Unit': 'unknown', + 'House MT': 'mid-terrace', + 'Flat FIFTH FLOOR': 'mid-floor', + 'Flat NINTH FLOOR': 'mid-floor', + 'House SD FIRST FLOOR': 'semi-detached', + 'Bungalow Supported housing': 'unknown', + 'Flat THIRD FLOOR': 'mid-floor', + 'Flat SECOND FLOOR': 'mid-floor', + 'House Homeless Unit': 'unknown', + 'Flat ELEVENTH FLOOR': 'mid-floor', + 'Flat TENTH FLOOR': 'mid-floor', + 'House. MT': 'mid-terrace' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index e255ba4d..aceecd8f 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -263,5 +263,18 @@ HEATING_MAPPINGS = { 'Oil Standard Boiler Heating': 'oil boiler', 'Oil Condensing Boiler Heating': 'oil boiler', 'Electric ASHP': 'air source heat pump', - 'Modern Slimline Storage Heaters': 'electric storage heaters' + 'Modern Slimline Storage Heaters': 'electric storage heaters', + # These are boiler makes from Unitas + 'UNKNOWN': 'unknown', + 'IDEAL': 'gas combi boiler', + 'VAILLANT': 'gas combi boiler', + 'THORN': 'gas combi boiler', + 'WORCESTER': 'gas combi boiler', + 'GLOW WORM': 'gas combi boiler', + 'VOKERA': 'gas combi boiler', + 'POTTERTON': 'gas combi boiler', + 'BAXI SOLO': 'gas combi boiler', + 'BAXI BERMUDA': 'gas combi boiler', + 'BAXI': 'gas combi boiler' + } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 225d1a1f..303ba0b3 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -206,5 +206,18 @@ PROPERTY_MAPPING = { '02 FLAT': 'flat', '04 
MAISONETTE': 'maisonette', '01 HOUSE MID': 'house', - '03 BUNGALOW': 'bungalow' + '03 BUNGALOW': 'bungalow', + 'Flat?': 'flat', + 'Bungalow ET': 'bungalow', + 'House. SD': 'house', + 'Bungalow SD': 'bungalow', + 'Bungalow D': 'bungalow', + 'House D': 'house', + 'House SD': 'house', + 'House ET': 'house', + 'Bungalow MT': 'bungalow', + 'House MT': 'house', + 'House. MT': 'house', + '': 'unknown' + } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py index a95f0529..03d6f9af 100644 --- a/asset_list/mappings/roof.py +++ b/asset_list/mappings/roof.py @@ -6,6 +6,7 @@ STANDARD_ROOF_CONSTRUCTIONS = { "pitched unknown access to loft", "piched unknown insulation", "pitched insulated", + "pitched less than 100mm insulation" "another dwelling above", "flat unknown insulation", "unknown insulated", @@ -23,5 +24,18 @@ ROOF_CONSTRUCTION_MAPPINGS = { '2018 onwards': 'unknown', 'Pitched (vaulted ceiling)': 'pitched insulated', np.nan: "unknown", - None: "unknown" + None: "unknown", + 'Unknown': 'unknown', + '270mm': 'pitched insulated', + '300mm+': 'pitched insulated', + '100mm': 'pitched less than 100mm insulation', + '250mm': 'pitched insulated', + '300mm': 'pitched insulated', + 'No Loft space': 'pitched no access to loft', + '75mm': 'pitched less than 100mm insulation', + '150mm': 'pitched insulated', + 'No Loft Hatch': 'pitched unknown access to loft', + '200mm': 'pitched insulated', + '0-49mm': 'pitched less than 100mm insulation', + '50mm': 'pitched less than 100mm insulation', } From f1b9ee2920f6871ee576979c9b40faf2dfbe8163 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 12 May 2025 15:58:41 +0100 Subject: [PATCH 5/6] preparing programme for mhs --- asset_list/AssetList.py | 156 +++- asset_list/app.py | 38 +- asset_list/mappings/built_form.py | 17 +- asset_list/mappings/heating_systems.py | 20 +- asset_list/mappings/property_type.py | 11 +- asset_list/mappings/walls.py | 11 +- .../Community Housing/reconciliation.py | 708 
++++++++++++++++++ .../Westward/Route March Reconciliation.py | 51 ++ etl/customers/l_and_g/risk_matrix.py | 147 +--- etl/customers/mhs/prepare_data.py | 60 ++ .../unitas/preparing_programme_rebuild.py | 28 + 11 files changed, 1063 insertions(+), 184 deletions(-) create mode 100644 etl/customers/Community Housing/reconciliation.py create mode 100644 etl/customers/Westward/Route March Reconciliation.py create mode 100644 etl/customers/mhs/prepare_data.py create mode 100644 etl/customers/unitas/preparing_programme_rebuild.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dcc7290a..94c3c235 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -693,6 +693,9 @@ class AssetList: c for c in self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES if c in self.standardised_asset_list.columns ] + if "Warmfront Finding" in self.standardised_asset_list.columns: + non_intrusive_columns.append("Warmfront Finding") + self.keep_variables += non_intrusive_columns self.rename_map = { @@ -931,7 +934,10 @@ class AssetList: raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): - raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + df = df.drop_duplicates( + subset=[self.DOMNA_PROPERTY_ID], + keep="first" + ) self.standardised_asset_list = self.standardised_asset_list.merge( df, how="left", on=self.DOMNA_PROPERTY_ID @@ -1260,7 +1266,7 @@ class AssetList: ) self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( - pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & non_intrusives_wall_filter & year_built_filter & @@ -1272,23 +1278,35 @@ class AssetList: # We also add a filter on anything that was generally identified by the non-intrusives 
self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_year_filter"] = ( - pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & - pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"]) & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] & (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & non_intrusives_wall_filter ) - self.standardised_asset_list["epc_indicates_empty_cavity"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( - self.EPC_NO_WALL_INSULATION_DESCRIPTIONS - ) & ( - self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) & ( - ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] - ) & ( - ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) + if (not self.non_intrusives_eligibility) and (not self.old_format_non_intrusives_present): + # If we have NO inspections data, we capture all of the wall types and don't filter on age of the EPC + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) + ) + ) + else: + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + 
~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) + ) ) - ) self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & @@ -1336,6 +1354,9 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( extraction_wall_filter & year_built_filter ) + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_year_filter"] = ( + extraction_wall_filter & ~year_built_filter + ) elif self.old_format_non_intrusives_present: print("Review these categories!!!!") @@ -1349,10 +1370,11 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( extraction_wall_filter ) + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_year_filter"] = False else: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = False - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = False + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_year_filter"] = False ###################################################### # Solar @@ -1480,7 +1502,7 @@ class AssetList: ) # We merge on the u-value for average thermal transmittance - roof_roof_data = pd.DataFrame(cleaned["roof-description"])[ + roof_data = pd.DataFrame(cleaned["roof-description"])[ ["original_description", "thermal_transmittance", "is_pitched", "is_loft"] ].rename( columns={ @@ -1490,7 +1512,7 @@ class AssetList: ) self.standardised_asset_list = self.standardised_asset_list.merge( - roof_roof_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + roof_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] ) # If the u-value of a roof is less than 0.7 we consider it insulated @@ -1749,6 +1771,16 @@ class AssetList: self.standardised_asset_list["cavity_reason"] ) + 
self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_year_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + f"Non-Intrusive Data Shows Cavity Extraction, built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: " + + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + ###################################################### # Flag solar ###################################################### @@ -1771,6 +1803,16 @@ class AssetList: self.standardised_asset_list["solar_reason"] ) + # Finally, anything flagged for solar should not be flagged for cavity - make them None + self.standardised_asset_list["cavity_reason"] = np.where( + ( + ~pd.isnull(self.standardised_asset_list["solar_reason"]) & + ~pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + # Flag anything that has existing outcomes if (self.outcomes is not None) and ("surveyed" in self.standardised_asset_list.columns): @@ -2170,7 +2212,7 @@ class AssetList: self.hubspot_data = programme_data - def flag_ecosurv(self, ecosurv_landlords=None): + def flag_ecosurv(self, ecosurv_landlords=None, landlords_to_ignore=None): """ This class will match ecosurv data to the asset list @@ -2193,6 +2235,11 @@ class AssetList: self.ecosurv["Landlord"].isin(landlord_references["Landlord"].values) ] + if landlords_to_ignore is not None: + landlord_ecosurv_data = landlord_ecosurv_data[ + ~landlord_ecosurv_data["Landlord"].isin(landlords_to_ignore) + ] + # Try and match to asset list matched = [] unmatched = [] @@ -2254,6 +2301,11 @@ class AssetList: # We now match matched = pd.DataFrame(matched) + # We'll possibly have duplicates here, where properties have been sold twice. 
We de-dupe + if matched[self.STANDARD_LANDLORD_PROPERTY_ID].duplicated().sum(): + # It doesn't matter too much which record we take + matched = matched.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID]) + self.standardised_asset_list = self.standardised_asset_list.merge( matched, how="left", @@ -2407,7 +2459,7 @@ class AssetList: self.outcomes.append(outcomes) lookup = pd.concat(lookup) - outcomes_no_match = pd.concat(outcomes_no_match) + self.outcomes_no_match = pd.concat(outcomes_no_match) self.outcomes = pd.concat(self.outcomes) if lookup.empty: @@ -2425,6 +2477,8 @@ class AssetList: date_col = "Survey Date" elif "Date letters sent" in self.outcomes.columns: date_col = "Date letters sent" + elif "Date Letter sent" in self.outcomes.columns: + date_col = "Date Letter sent" else: raise NotImplementedError("Invalid date in outcomes - implement me") @@ -2564,8 +2618,18 @@ class AssetList: axis=1 ) + scheme_col = ( + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in master_data.columns else "AFFORDABLE WARMTH" + ) postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code" house_no_col = 'NO.' if 'NO.'
in master_data.columns else "NO" + property_type_col = ( + "PROPERTY TYPE As per table emailed" if + "PROPERTY TYPE As per table emailed" in + master_data.columns else "PROPERTY TYPE As per table emailed" + ) + measure_mix_col = "MEASURE COMBO" # Otherwise, we need to match algorithmically has_property_id = "UPRN" in master_data.columns @@ -2574,6 +2638,10 @@ class AssetList: unmatched = [] for _, row in tqdm(master_data.iterrows(), total=len(master_data)): + original_house_no = row[house_no_col] + original_street = row["Street / Block Name"] + original_postcode = row[postcode_col] + if pd.isnull(row[postcode_col]): continue @@ -2595,9 +2663,40 @@ class AssetList: ] house_no = row[house_no_col] - if isinstance(house_no, float): + if isinstance(house_no, (float, int)): house_no = str(int(house_no)) + if house_no not in df["house_no"].values: + # Handle postcode errors + postal_region = row[postcode_col].split(" ")[0].lower() + df = self.standardised_asset_list[ + ( + self.standardised_asset_list[self.STANDARD_POSTCODE] + .str.strip().str.lower().str.startswith(postal_region) + ) + ] + + if house_no not in df["house_no"].values: + unmatched.append(row["row_id"]) + continue + df = df[df["house_no"] == house_no] + if df.shape[0] > 1: + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(row["Street / Block Name"].lower()) + ] + if df.shape[0] == 0: + unmatched.append(row["row_id"]) + continue + matched.append( + { + "row_id": row["row_id"], + "original_house_no": original_house_no, + "original_street": original_street, + "original_postcode": original_postcode, + self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0], + } + ) + if house_no in df["house_no"].values: df = df[df["house_no"] == house_no] if df.shape[0] != 1: @@ -2632,14 +2731,12 @@ class AssetList: ] if any( - df[self.STANDARD_PROPERTY_TYPE].str.contains( - row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() - ) + 
df[self.STANDARD_PROPERTY_TYPE].str.contains(row[property_type_col].split(" ")[-1].lower()) ): # We ignore "block of flats" entries df = df[ df[self.STANDARD_PROPERTY_TYPE].str.contains( - row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() + row[property_type_col].split(" ")[-1].lower() ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") ] @@ -2649,6 +2746,9 @@ class AssetList: matched.append( { "row_id": row["row_id"], + "original_house_no": original_house_no, + "original_street": original_street, + "original_postcode": original_postcode, self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0], } ) @@ -2657,10 +2757,12 @@ class AssetList: # We match the "UPRN" which is the landlords ID, onto the master sheet matched = pd.DataFrame(matched) - master_to_append = master_data[["row_id", install_col, submission_col]].merge( + master_to_append = master_data[[scheme_col, "row_id", install_col, submission_col, measure_mix_col]].merge( matched, how="left", on="row_id" ).rename( columns={ + scheme_col: "funding_scheme", + measure_mix_col: "measure_mix", install_col: "survey_status", submission_col: "submission_date" } @@ -2671,10 +2773,6 @@ class AssetList: master_data["row_id"].isin(unmatched) ] - scheme_col = ( - "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if - "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in master_data.columns else "AFFORDABLE WARMTH" - ) # The columns are massively different - we take just a few unmatched_df = unmatched_df[ [ diff --git a/asset_list/app.py b/asset_list/app.py index 76e09295..be2ef031 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -62,10 +62,42 @@ def app(): Property UPRN """ + # Community Housing new list + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme " + "Reconciliation") + data_filename = "SUB EPC C to Domna.xlsx" + sheet_name = "Sheet1" + postcode_column = 'POSTCODE' + fulladdress_column = None + 
address1_column = "ADDRESS" + address1_method = None + address_cols_to_concat = ["ADDRESS", "ESTATE", "TOWN"] + missing_postcodes_method = None + landlord_year_built = "BUILD DATE" + landlord_os_uprn = None + landlord_property_type = "PROPERTY TYPE" + landlord_built_form = "PROPERTY TYPE" + landlord_wall_construction = "CONSTRUCTION TYPE" + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "UPRN" + landlord_sap = None + outcomes_filename = [] + outcomes_sheetname = [] + outcomes_postcode = [] + outcomes_houseno = [] + outcomes_id = [] + outcomes_address = [] + master_filepaths = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + # Unitas data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas" - data_filename = "UNITAS - Asset List.xlsx" - sheet_name = "Asset List" + data_filename = "unitas_asset_list_for_analysis.xlsx" + sheet_name = "Sheet1" postcode_column = 'Post Code' fulladdress_column = "Address Line 1" address1_column = "Address Line 1" @@ -611,7 +643,7 @@ def app(): epc_api_only = False force_retrieve_data = False skip = None # Used to skip already completed chunks - chunk_size = 1000 + chunk_size = 5000 filename = "Chunk {i}.csv" download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 6ce31cdd..ffd698b3 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -250,5 +250,20 @@ BUILT_FORM_MAPPINGS = { 'House Homeless Unit': 'unknown', 'Flat ELEVENTH FLOOR': 'mid-floor', 'Flat TENTH FLOOR': 'mid-floor', - 'House. MT': 'mid-terrace' + 'House. 
MT': 'mid-terrace', + 'Ground Floor Bedsit': 'ground floor', + 'Mid Terrace With Passage': 'mid-terrace', + 'End Of Terrace': 'end-terrace', + 'Ground Floor Maisonette': 'ground floor', + 'First Floor Bedsit': 'mid-floor', + 'GROUND FLOOR BEDSIT': 'ground floor', + 'GROUND FLOOR FLAT': 'ground floor', + 'BUNGALOW': 'unknown', + 'HOUSE 1 LIVING ROOM': 'unknown', + 'MAISONETTE OVER SHOP': 'unknown', + 'SECOND FLOOR FLAT': 'mid-floor', + 'FIRST FLOOR FLAT': 'ground floor', + 'GROUND FL MAISONETTE': 'ground floor', + 'HOUSE 2 LIVING ROOMS': 'unknown', + 'FLAT OVER SHOP': 'unknown' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index aceecd8f..463e2cef 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -5,7 +5,7 @@ STANDARD_HEATING_SYSTEMS = { "gas boiler, radiators", "electric storage heaters", "district heating", - "communal heating" + "communal heating", "gas condensing boiler", "oil boiler", "gas condensing combi", @@ -32,7 +32,7 @@ STANDARD_HEATING_SYSTEMS = { HEATING_MAPPINGS = { "Combi - GAS": "gas combi boiler", - "E7 Storage Heaters": "high heat retention storage heaters", + "E7 Storage Heaters": "electric storage heaters", "District heating system": "district heating", "Condensing Boiler - GAS": "gas condensing boiler", "Boiler Oil/other": "oil boiler", @@ -50,7 +50,7 @@ HEATING_MAPPINGS = { "Gas fire": "other", "Backboiler - Solid fuel": "other", 'combi - gas': 'gas combi boiler', - 'e7 storage heaters': 'high heat retention storage heaters', + 'e7 storage heaters': 'electric storage heaters', 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', 'boiler oil/other': 'oil boiler', @@ -275,6 +275,18 @@ HEATING_MAPPINGS = { 'POTTERTON': 'gas combi boiler', 'BAXI SOLO': 'gas combi boiler', 'BAXI BERMUDA': 'gas combi boiler', - 'BAXI': 'gas combi boiler' + 'BAXI': 'gas combi boiler', + 'Combi Boiler': 'gas combi boiler', + 
'Air Source Heat Pump': 'air source heat pump', + 'Dual Fuel': 'other', + 'Regular Boiler': 'gas condensing boiler', + 'No Main Heating': 'no heating', + 'None (via Communal System)': 'communal heating', + 'No Mains Heating': 'no heating', + 'Open Fire with Back Boiler': 'solid fuel', + 'No Gas Boiler': 'no heating', + 'Back Boiler': 'solid fuel', + "This cell has an external reference that can't be shown or edited. Editing this cell will remove the external " + "reference.": 'unknown' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 303ba0b3..d455d312 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -218,6 +218,15 @@ PROPERTY_MAPPING = { 'Bungalow MT': 'bungalow', 'House MT': 'house', 'House. MT': 'house', - '': 'unknown' + '': 'unknown', + 'GROUND FLOOR BEDSIT': 'bedsit', + 'HOUSE 1 LIVING ROOM': 'house', + 'MAISONETTE OVER SHOP': 'maisonette', + 'GROUND FLOOR FLAT': 'flat', + 'SECOND FLOOR FLAT': 'flat', + 'FIRST FLOOR FLAT': 'flat', + 'GROUND FL MAISONETTE': 'maisonette', + 'HOUSE 2 LIVING ROOMS': 'house', + 'FLAT OVER SHOP': 'flat' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 1fb8cb79..5e32531f 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -212,12 +212,17 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Cornish': 'system built', 'Rwate': 'system built', 'Hill Presweld Steel': 'system built', - 'Cavity Filled Cavity': 'filled cavity', 'Cavity Unknown': 'cavity unknown insulation', 'Cavity Filled Cavity (internal)': 'filled cavity', '': 'unknown', 'Cavity Internal Insulation': 'filled cavity', - 'Cavity As Built': "uninsulated cavity" - + 'Cavity As Built': "uninsulated cavity", + 'Non Trad Large Panel System': 'system built', + 'Non Trad Cornish': 'system built', + 'Non Trad Reema': 'system built', + 'Traditional Cavity Brickwork': 'cavity unknown insulation', + 'System build (undefined)': 'system built', + 'Non 
Trad Wimpey': 'system built', + 'Non Trad Wates': 'system built' } diff --git a/etl/customers/Community Housing/reconciliation.py b/etl/customers/Community Housing/reconciliation.py new file mode 100644 index 00000000..68e2b265 --- /dev/null +++ b/etl/customers/Community Housing/reconciliation.py @@ -0,0 +1,708 @@ +""" +This script is used to reconcile the data from the Community Housing project, to understand the differences in +the various asset lists, and the work that has been conducted +""" +import os +import pandas as pd +import numpy as np +from tqdm import tqdm +from asset_list.AssetList import AssetList +from backend.SearchEpc import SearchEpc + +# Data preparation +outcomes_1 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme Reconciliation/Outcomes " + "Community Housing.xlsx", + sheet_name="Sheet1", +) +outcomes_2 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme Reconciliation/Outcomes " + "Community Housing.xlsx", + sheet_name="ECO4 + PV", +) +outcomes_2["Type of Funding"] = "ECO4 Solar" + +combined_outcomes = pd.concat([outcomes_1, outcomes_2], ignore_index=True) +combined_outcomes.columns = [ + 'Surveyor', 'Housing Association', 'No.', 'Address', 'Postcode', 'Outcome', 'Type of Funding', "Notes", + 'Previous letter sent Date:', 'Date Letter sent', 'Installer' +] +# Store +combined_outcomes.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme " + "Reconciliation/combined_outcomes.xlsx", +) + +################################################################################################ +# Config for asset list standardisation +################################################################################################ + +data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme Reconciliation" +data_filename = "Community Housing - Original Asset List Copy for 
Reconciliation.xlsx" +sheet_name = "Assets" +postcode_column = 'Postcode' +fulladdress_column = "Full Address" +address1_column = None +address1_method = "house_number_extraction" +address_cols_to_concat = [] +missing_postcodes_method = None +landlord_year_built = "Build_Date" +landlord_os_uprn = None +landlord_property_type = "Asset_Type1" +landlord_built_form = "Asset_Classification" +landlord_wall_construction = None +landlord_roof_construction = None +landlord_heating_system = "Heat Source Static" +landlord_existing_pv = None +landlord_property_id = "Asset_Reference" +landlord_sap = None +outcomes_filename = [ + os.path.join(data_folder, "combined_outcomes.xlsx"), +] +outcomes_sheetname = ["Sheet1"] +outcomes_postcode = ["Postcode"] +outcomes_houseno = ["No."] +outcomes_id = [None] +outcomes_address = ["Address"] +master_filepaths = [ + os.path.join(data_folder, "Submissions - for analysis.csv"), +] +master_to_asset_list_filepath = None +phase = False +ecosurv_landlords = "community community|community housing|mr community|david lindwood" + +manual_uprn_map = {} + +asset_list = AssetList( + local_filepath=os.path.join(data_folder, data_filename), + header=0, + sheet_name=sheet_name, + address1_colname=address1_column, + postcode_colname=postcode_column, + landlord_property_id=landlord_property_id, + full_address_colname=fulladdress_column, + full_address_cols_to_concat=address_cols_to_concat, + missing_postcodes_method=missing_postcodes_method, + address1_extraction_method=address1_method, + landlord_year_built=landlord_year_built, + landlord_uprn=landlord_os_uprn, + landlord_property_type=landlord_property_type, + landlord_built_form=landlord_built_form, + landlord_wall_construction=landlord_wall_construction, + landlord_roof_construction=landlord_roof_construction, + landlord_heating_system=landlord_heating_system, + landlord_existing_pv=landlord_existing_pv, + landlord_sap=landlord_sap, + phase=phase +) +asset_list.init_standardise() + 
+asset_list.apply_standardiation() + +# We now flag properties that have been treated under existing programmes +asset_list.flag_outcomes( + outcomes_filepaths=outcomes_filename, + outcomes_sheetname=outcomes_sheetname, + outcomes_address=outcomes_address, + outcomes_postcode=outcomes_postcode, + outcomes_houseno=outcomes_houseno, + outcomes_id=outcomes_id +) + +if pd.isnull(asset_list.outcomes["domna_property_id"]).sum() == 1: + # We fix this one manually + asset_list.outcomes["domna_property_id"] = asset_list.outcomes["domna_property_id"].fillna( + "29walternashroadeastbirchencoppicekidderminsterdy117ea-caa3a8d92ea9" + ) +else: + raise Exception("Something went wrong") + +asset_list.flag_survey_master( + master_filepaths=master_filepaths, + master_to_asset_list_filepath=master_to_asset_list_filepath +) + +master_surveyed = asset_list.master_surveyed +scheme_map = { + "ECO4 A/W": "ECO4", + 'ECO4 GBIS': "GBIS", + 'ECO4 - REMEDIAL CWI ONLY': "ECO4 Remedial", + "ECO4 GBIS REMEDIAL": "GBIS Remedial", + 'ECO4 - Remedial CWI Only': "ECO4 Remedial", + 'ECO4 GBIS Remedial': "GBIS Remedial" +} +master_surveyed["funding_scheme"] = master_surveyed["funding_scheme"].map(scheme_map) +master_surveyed["survey_reference"] = master_surveyed["funding_scheme"] + ": " + master_surveyed["measure_mix"] +master_surveyed = master_surveyed.merge( + asset_list.standardised_asset_list[["domna_property_id", "landlord_property_id"]], + how="left", + on="landlord_property_id", +) +if pd.isnull(master_surveyed["domna_property_id"]).sum(): + raise ValueError("Some of the master surveyed properties do not have a domna_property_id") +# Flag anything in outcomes that has been listed as surveyed, that is NOT in the master_surveyed sheet +surveyed_outcomes = asset_list.outcomes[ + asset_list.outcomes["Outcome"].isin(["surveyed", "surveyed"]) +] +outcomes_not_in_master = surveyed_outcomes[ + ~surveyed_outcomes["domna_property_id"].isin(master_surveyed["domna_property_id"]) +] 
+outcomes_not_in_master["Type of Funding"] = outcomes_not_in_master["Type of Funding"].fillna("Work Type Not Filled In") + +asset_list.flag_ecosurv( + ecosurv_landlords=ecosurv_landlords, + landlords_to_ignore=[ + "Watford Community housing", "Eastlight Community housing", "Mr Tower Hamlets Community Housing" + ] +) + +# These are properties NOT on the Community Housing asset list that were sold under the wrong HA +# asset_list.ecosurv_no_match.to_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme " +# "Reconciliation/Ecosurv - properties sold to Community Housing, not belonging to them.csv", +# index=False +# ) + +# We read in the works, split by sold to SGEC and on-hold +billed_to_installer = pd.read_csv( + os.path.join( + data_folder, "Community Housing Deck of works", "SGEC BILLED -Table 1.csv" + ), +) +billed_to_installer["billed"] = True + +not_billed_to_installer = pd.read_csv( + os.path.join( + data_folder, "Community Housing Deck of works", "ON HOLD -Table 1.csv" + ), +) +not_billed_to_installer["billed"] = False + +sgec_billings = pd.concat( + [billed_to_installer, not_billed_to_installer], +) +sgec_billings = sgec_billings.reset_index(drop=True) +sgec_billings["row_id"] = sgec_billings.index + +# We match these two lists back to the domna_property_id. They SHOULD match to submissions +scheme_col = ( + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in billed_to_installer.columns else "AFFORDABLE WARMTH" +) +postcode_col = "POSTCODE" if "POSTCODE" in billed_to_installer.columns else "Post Code" +house_no_col = 'NO.' if 'NO.' 
in billed_to_installer.columns else "NO" +property_type_col = ( + "PROPERTY TYPE As per table emailed" if + "PROPERTY TYPE As per table emailed" in + billed_to_installer.columns else "PROPERTY TYPE As per table emailed" +) +measure_mix_col = "MEASURE COMBO" +manual_corrections = { + "30+DY12 1EB": "73440300", + "32+DY12 1EB": "73440320", + "1+DY11 7ES": "20150010", + "12+DY11 7EP": "9460120", + "72+DY11 7PA": "88520720", + "39+DY13 0DR": "44250390", + "43+DY11 7EF": "2460430", + "45+DY11 7EG": "2460450", + "47+DY11 7EG": "2460470", + "49+DY11 7EG": "2460490", + "11+DY13 0HB": "87320110", + "4+DY130HA": "87320040" +} +billed_lookup = [] +for _, row in tqdm(sgec_billings.iterrows(), total=len(sgec_billings)): + postcode = row["Post Code"] + houseno = row["NO."] + + # We need to correct some records + if manual_corrections.get("+".join([houseno, postcode])): + landlord_pid = manual_corrections["+".join([houseno, postcode])] + df = asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["landlord_property_id"] == landlord_pid) + ] + if df.shape[0] != 1: + raise ValueError(f"More than one match found for {landlord_pid} in the standardised asset list") + billed_lookup.append( + { + "domna_property_id": df["domna_property_id"].values[0], + "row_id": row["row_id"], + } + ) + continue + + df = master_surveyed[ + (master_surveyed["original_house_no"] == houseno) & + (master_surveyed["original_postcode"] == postcode) + ] + if df.shape[0] != 1: + # Try a search on the asset list + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() + + df = asset_list.standardised_asset_list[ + ( + asset_list.standardised_asset_list[asset_list.STANDARD_POSTCODE] + .str.strip().str.lower().str.replace(" ", "") == postcode_no_space + ) + ].copy() + + house_no = row[house_no_col] + if isinstance(house_no, float): + house_no = str(int(house_no)).lower() + else: + house_no = str(house_no).lower() + + df["house_no"] = df.apply( + lambda x: 
SearchEpc.get_house_number( + str(x[asset_list.STANDARD_ADDRESS_1]), str(x[asset_list.STANDARD_POSTCODE]) + ), + axis=1 + ) + df = df[df["house_no"].str.lower() == house_no].copy() + + if df.shape[0] == 1: + billed_lookup.append( + { + "domna_property_id": df["domna_property_id"].values[0], + "row_id": row["row_id"], + } + ) + continue + + raise ValueError(f"More than one match found for {'+'.join([houseno, postcode])} in the master surveyed list") + + billed_lookup.append( + { + "domna_property_id": df["domna_property_id"].values[0], + "row_id": row["row_id"], + } + ) + +billed_lookup = pd.DataFrame(billed_lookup) + +sgec_billings = sgec_billings.merge( + billed_lookup, + how="left", + on="row_id" +) + +# We get the asset list that Community Housing thinks they sent Warmfront + +master_data_sheet = pd.read_excel( + os.path.join( + data_folder, "Warmfront.xlsx" + ), + sheet_name="Asset Stock List (3)", +) +master_data_sheet["Asset_Reference"] = master_data_sheet["Asset_Reference"].astype(str) + +# 1) We check that all of the properties in the asset list we have on file are in the asset list that Community Housing +# believe they sent Warmfront +if not asset_list.standardised_asset_list[ + ~asset_list.standardised_asset_list["landlord_property_id"].isin( + master_data_sheet["Asset_Reference"].astype(str).values + ) +].empty: + raise ValueError("Some of the properties in the asset list are not in the Warmfront asset list") + +# This column documents whether or not the property is in the asset list that the WFT were sent +# There are 189 properties that were never sent to WFT, but all properties are accounted for in the asset list +master_data_sheet["Is Property in WFT Asset List"] = master_data_sheet["Asset_Reference"].astype(str).isin( + asset_list.standardised_asset_list["landlord_property_id"].astype(str).values +) + +# We now merge on the Warmfront findings +master_data_sheet = master_data_sheet.merge( + asset_list.standardised_asset_list[["landlord_property_id", 
"non-intrusives: ECO Eligibility"]], + how="left", + left_on="Asset_Reference", + right_on="landlord_property_id" +) +master_data_sheet["non-intrusives: ECO Eligibility"] = master_data_sheet["non-intrusives: ECO Eligibility"].fillna( + "Not in original asset list" +) + +# SGEC did a number of CIGA checks. We match these onto the master data sheet + +# TODO: Need to split the programme into historical 2023 and 2024 (there was a cutoff data in late 2024 which seemed +# to be the start of the new programme +# Seems like there were 2 main checks - it also seems like this was a 2 phase programme, where these CIGA checks +# correspond to phase 2 +ciga_checks_1 = pd.read_excel( + os.path.join( + data_folder, "CIGA Checks", "2 CIGA Check WFT 14102024 x1073.xlsx" + ), + sheet_name="Worksheet" +) +ciga_checks_1 = ciga_checks_1[~pd.isnull(ciga_checks_1["Postcode"])] +ciga_checks_1["request"] = "1073 properties" +ciga_checks_2 = pd.read_excel( + os.path.join( + data_folder, "CIGA Checks", "2 CIGA Check 01112024 x125.xlsx" + ), + sheet_name="Worksheet" +) +ciga_checks_2 = ciga_checks_2[~pd.isnull(ciga_checks_2["Postcode"])] +ciga_checks_2["request"] = "125 flats" + +cigas = pd.concat([ciga_checks_1, ciga_checks_2], ignore_index=True) +cigas["row_id"] = cigas.index + +# We add some temp columns to allow for easier matching +asset_list.standardised_asset_list["house_no"] = asset_list.standardised_asset_list.apply( + lambda x: SearchEpc.get_house_number( + str(x["domna_full_address"]), str(x["domna_postcode"]) + ), + axis=1 +) + +manual_fixes = { + "2 Austcliffe Road Cookley, Kidderminster": "2250020", + '5 Brett Young Close, Kidderminster': "9800050" +} +incorrect_ciga_return = [ + "19 Wood Street, Kidderminster", + "nan Charles Street", + "53 Harold Evers Way, Kidderminster", + '63 Harold Evers way' +] +ciga_lookup = [] +for _, row in tqdm(cigas.iterrows(), total=len(cigas)): + + if manual_fixes.get(row["Matched Address"]): + ll_pid = manual_fixes[row["Matched Address"]] + df = 
asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["landlord_property_id"] == ll_pid) + ] + ciga_lookup.append( + { + "domna_property_id": df["domna_property_id"].values[0], + "row_id": row["row_id"], + } + ) + continue + + if (row["Matched Address"] in incorrect_ciga_return) or ( + " ".join([str(row["Address1"]), row["Address2"]]) in incorrect_ciga_return + ): + continue + + df = asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["domna_postcode"] == row["Postcode"]) + ] + + df = df[(df["house_no"].astype(str) == str(row["Address1"]))] + + if df.empty: + df = asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["domna_postcode"] == row["Matched Postcode"]) + ] + df = df[(df["house_no"].astype(str) == str(row["Address1"]))] + + if df.shape[0] > 1: + df = asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["domna_full_address"].str.lower().str.replace(",", "").str.contains( + row["Matched Address"].lower().replace(",", ""), na=False)) + ] + if df.empty: + df = asset_list.standardised_asset_list[ + (asset_list.standardised_asset_list["domna_full_address"].str.lower().str.replace(",", "").str.contains( + row["Address2"].lower().replace(",", ""), na=False)) + ] + + df = df[(df["house_no"].astype(str) == str(row["Address1"]))] + + if df.shape[0] != 1: + raise Exception("More than one match found for {row['Address1']} in the asset list") + + ciga_lookup.append( + { + "domna_property_id": df["domna_property_id"].values[0], + "row_id": row["row_id"], + } + ) + +ciga_lookup = pd.DataFrame(ciga_lookup) + +cigas = cigas.merge( + ciga_lookup, + how="left", + on="row_id" +) +cigas = cigas[~pd.isnull(cigas["domna_property_id"])] + +cigas = cigas.merge( + asset_list.standardised_asset_list[["domna_property_id", "landlord_property_id"]], + how="left", + on="domna_property_id" +) + +# Note 4 entries in the CIGA checks did NOT match to the asset list (were for properties not owned by Community 
Housing) +master_data_sheet = master_data_sheet.merge( + cigas[["landlord_property_id", "Guarantee", "request"]].rename( + columns={"request": "CIGA request batch"} + ), + how="left", + on="landlord_property_id" +) + +# Fill missing survey_reference with funding_scheme +master_surveyed["survey_reference"] = master_surveyed["survey_reference"].fillna( + master_surveyed["funding_scheme"] +) + +master_surveyed_to_merge = master_surveyed[ + ~master_surveyed["domna_property_id"].isin(sgec_billings["domna_property_id"].values) +] +master_surveyed_to_merge["Survey Status"] = "Surveyed, Submitted, not on SGEC Deck of Works" + +# We now merge on what we've surveyed and submitted +master_data_sheet = master_data_sheet.merge( + master_surveyed_to_merge[ + ["landlord_property_id", "survey_reference", "submission_date", "cancelled", "Survey Status"] + ].rename( + columns={ + "survey_reference": "Survey Type", "submission_date": "Survey Date", + "cancelled": "Was the Install Cancelled?" + } + ), + how="left", + on="landlord_property_id" +) + +# We now deduce the status of the work based on sgec_billings +sgec_billings = sgec_billings.merge( + asset_list.standardised_asset_list[["landlord_property_id", "domna_property_id"]], + how="left", + on="domna_property_id" +) + +dupe_ids = sgec_billings[sgec_billings["domna_property_id"].duplicated()]["domna_property_id"] +# We sort by domna_property_id and billed (where true should be first) and take the first instance +sgec_billings = sgec_billings.sort_values( + ["domna_property_id", "billed"], ascending=[True, False] +) +sgec_billings = sgec_billings.drop_duplicates( + subset=["domna_property_id"], + keep="first" +) + +sgec_billings["Survey Type"] = ( + sgec_billings["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map) + ": " + + sgec_billings["MEASURE COMBO"] +) +sgec_billings["Survey Type"] = sgec_billings["Survey Type"].fillna( + sgec_billings["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map) +) 
+sgec_billings["Survey Date"] = sgec_billings['SUBMISSION DATE'] +sgec_billings["Was the Install Cancelled?"] = ( + sgec_billings["INSTALLED"].astype(str).str.lower().str.contains("cancel") +) + +sgec_billings['Survey Status'] = np.where( + sgec_billings["billed"] == True, + "Surveyed, Submitted, on SGEC Deck of Works", + "Surveyed, not submitted to SGEC, on SGEC Deck of Works" +) + +master_data_sheet = master_data_sheet.merge( + sgec_billings[ + ["landlord_property_id", "Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status"]], + how="left", + on="landlord_property_id", + suffixes=("", "_y") +) + +for col in ["Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status"]: + master_data_sheet[col] = np.where( + pd.isnull(master_data_sheet[col]) & ~pd.isnull(master_data_sheet[col + "_y"]), + master_data_sheet[col + "_y"], + master_data_sheet[col] + ) + master_data_sheet = master_data_sheet.drop(columns=[col + "_y"]) + +outcomes_not_in_master = outcomes_not_in_master.merge( + asset_list.standardised_asset_list[["landlord_property_id", "domna_property_id"]], + how="left", + left_on="domna_property_id", + right_on="domna_property_id" +) +# We also filter out any that were in the SGEC billings +outcomes_not_in_master = outcomes_not_in_master[ + ~outcomes_not_in_master["domna_property_id"].isin(sgec_billings["domna_property_id"].values) +] + +# We now merge on outcomes. 
There are a small number of surveyed outcomes that were not submitted +master_data_sheet = master_data_sheet.merge( + outcomes_not_in_master[["landlord_property_id", 'Type of Funding', "Date Letter sent"]], + how="left", + on="landlord_property_id", +) +master_data_sheet["Survey Status"] = np.where( + pd.isnull(master_data_sheet["Survey Type"]) & ~pd.isnull(master_data_sheet["Type of Funding"]), + "Surveyed, On Outcomes, not submitted", + master_data_sheet["Survey Status"] +) + +master_data_sheet["Survey Type"] = np.where( + pd.isnull(master_data_sheet["Survey Type"]) & ~pd.isnull(master_data_sheet["Type of Funding"]), + master_data_sheet["Type of Funding"], + master_data_sheet["Survey Type"] +) +master_data_sheet["Survey Date"] = np.where( + pd.isnull(master_data_sheet["Survey Date"]) & ~pd.isnull(master_data_sheet["Date Letter sent"]), + master_data_sheet["Date Letter sent"], + master_data_sheet["Survey Date"] +) +master_data_sheet = master_data_sheet.drop(columns=["Type of Funding", "Date Letter sent"]) + +# We now need to compare the submissions that SGEC have sent us, because the deck of works is likely incorrect +# given the number of properties that have been received by SGEC + +# We have submissions from the following dates: +# - 18/11/2024 +# - 10/03/2025 +# - A sheet that claims to be 25/11/2024 but has 18/11/2024 as the submission date +# - 16/12/2024 +# - 02/12/2024 +# - 10/02/2025 +sgec_received_submissions = [] +for filename in [ + "4x108 18.11.24 - RT MASTERS SGEC INVOICE.xlsx", + "4x144 COMMUNITY HOUSING TOTAL PROJECT INV 10032025.xlsx", + "4x19 25.11.2024 - RT Master SGEC.xlsx", + "4x37 16.12.2024 - SGEC INVOICED.xlsx", + "4x60 02.12.2024 - RT SGEC INV.xlsx", + "4x78 10.02.2025 MASTERS - SGEC INVOICED-CORRECT.xlsx" +]: + data = pd.read_excel( + os.path.join( + data_folder, "SGEC Received Submissions", filename + ), + ) + data["filename"] = filename + sgec_received_submissions.append(data) + +sgec_received_submissions = 
pd.concat(sgec_received_submissions) +sgec_received_submissions = sgec_received_submissions.reset_index(drop=True) +sgec_received_submissions["row_id"] = sgec_received_submissions.index + +manual_fix = { + "5a+DY10 3JR": "6856005A", + '12+DY10 3JR': "78900120", + "9+DY10 3JR": "86280090", + '10+DY10 3JL': "86280100", + "66+DY10 3JS": "68560660", + "70+DY10 3JS": "68560700", + "72+DY10 3JS": "68560720", + "12+DY10 3JP": "86280120", + "2A+DY11 5TZ": "6872002A", + "3A+DY11 5TZ": "6872003A", + "4A+DY11 5TZ": "6872004A" +} +sgec_received_submissions_lookup = [] +for _, row in tqdm(sgec_received_submissions.iterrows(), total=len(sgec_received_submissions)): + + _key = "+".join([str(row["NO."]), str(row["Post Code"])]) + + if manual_fix.get(_key) is not None: + ll_pid = manual_fix[_key] + sgec_received_submissions_lookup.append( + { + "row_id": row["row_id"], + "landlord_property_id": ll_pid, + } + ) + continue + + match = sgec_billings[ + (sgec_billings['NO.'].astype(str) == str(row['NO.'])) & + (sgec_billings['Post Code'] == row['Post Code']) + ] + + if match.shape[0] > 1: + raise Exception(f"something went wrong {_key} {row['Street / Block Name']}") + + if match.shape[0] == 1: + sgec_received_submissions_lookup.append( + { + "row_id": row["row_id"], + "landlord_property_id": match["landlord_property_id"].values[0], + } + ) + continue + + match = master_surveyed[ + (master_surveyed['original_house_no'].astype(str) == str(row['NO.'])) & + (master_surveyed['original_postcode'] == row['Post Code']) + ] + + if match.shape[0] > 1: + raise Exception(f"something went wrong 2 {_key} {row['Street / Block Name']}") + + if match.shape[0] == 0: + raise Exception(f"No match {_key} {row['Street / Block Name']}") + + sgec_received_submissions_lookup.append( + { + "row_id": row["row_id"], + "landlord_property_id": match["landlord_property_id"].values[0], + } + ) + +sgec_received_submissions_lookup = pd.DataFrame(sgec_received_submissions_lookup) +sgec_received_submissions = 
sgec_received_submissions.merge( + sgec_received_submissions_lookup[["row_id", "landlord_property_id"]], + how="left", + on="row_id" +) + +sgec_received_submissions["Survey Type"] = ( + sgec_received_submissions["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map) + ": " + + sgec_received_submissions["MEASURE COMBO"] +) + +sgec_received_submissions["Survey Type"] = sgec_received_submissions["Survey Type"].fillna( + sgec_received_submissions["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map) +) +sgec_received_submissions["Survey Date"] = sgec_received_submissions['SUBMISSION DATE'] +sgec_received_submissions["Was the Install Cancelled?"] = ( + sgec_received_submissions["INSTALLED"].astype(str).str.lower().str.contains("cancel") +) +sgec_received_submissions['Survey Status'] = "Submission sent to SGEC, Confirmed by SGEC" +sgec_received_submissions["Survey Received by SGEC"] = True + +# We now merge on the submissions that SGEC have sent us +master_data_sheet = master_data_sheet.merge( + sgec_received_submissions[ + [ + "landlord_property_id", "Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status", + "Survey Received by SGEC" + ] + ], + how="left", + on="landlord_property_id", + suffixes=("", "_y") +) + +# Fill in the gaps +for col in ["Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status"]: + master_data_sheet[col] = np.where( + pd.isnull(master_data_sheet[col]) & ~pd.isnull(master_data_sheet[col + "_y"]), + master_data_sheet[col + "_y"], + master_data_sheet[col] + ) + master_data_sheet = master_data_sheet.drop(columns=[col + "_y"]) + +if master_data_sheet["Asset_Reference"].duplicated().sum(): + raise ValueError("There are duplicates in the asset reference column") + +# Drop this at the end +master_data_sheet = master_data_sheet.drop(columns=["landlord_property_id"]) + +master_data_sheet.to_excel( + os.path.join( + data_folder, "Draft Results.xlsx" + ), +) diff --git a/etl/customers/Westward/Route 
March Reconciliation.py b/etl/customers/Westward/Route March Reconciliation.py new file mode 100644 index 00000000..1f160bc9 --- /dev/null +++ b/etl/customers/Westward/Route March Reconciliation.py @@ -0,0 +1,51 @@ +import pandas as pd + +tabs = [ + "Straight Fill", "Solar PV - Straight Fill", "RDF CIGA checks", "Solar PV - RDF CIGA Checks", + "AT BUILD", "Solar PV - AT BUILD" +] + +programme_revisions = [] +for tab in tabs: + original_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward/Route March/WESTWARD - Route March Prep.xlsx", + sheet_name=tab, + ) + + revised_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward/Route March/WESTWARD - GBIS List revised for " + "Domna.xlsx", + sheet_name=tab, + ) + revised_list["Client Review"] = "Retain in programme" + + df = original_list[["Place ref"]].copy() + df["Tab"] = tab + + df = df.merge(revised_list[["Place ref", "Client Review"]], how="left", on="Place ref") + df["Client Review"] = df["Client Review"].fillna("Remove from programme") + + programme_revisions.append(df) + +programme_revisions = pd.concat(programme_revisions) + +# Read in the standardised asset list and create the column to append to that +al = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward/WESTWARD - completed list - " + "08.05.2025 - Standardised - Client Review.xlsx", + sheet_name="Standardised Asset List", +) + +client_revisions = al[["landlord_property_id"]].merge( + programme_revisions[["Place ref", "Client Review"]], + how="left", + left_on="landlord_property_id", + right_on="Place ref", +) + +client_revisions["Client Review"] = client_revisions["Client Review"].fillna("Needs Review") +client_revisions["Client Review Date"] = "08/05/2025" + +client_revisions.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward/Route March/client_revisions.csv", index=False +) diff --git a/etl/customers/l_and_g/risk_matrix.py 
b/etl/customers/l_and_g/risk_matrix.py index bc1bc952..c800117e 100644 --- a/etl/customers/l_and_g/risk_matrix.py +++ b/etl/customers/l_and_g/risk_matrix.py @@ -20,9 +20,9 @@ def app(): "ventilation": 350, "Room Roof Insulation": 210, "Loft insulation": 15, - "Internal wall insulation": 215, + "Internal wall insulation": 131, "External wall insulation": 298.35, - "Solid wall insulation": 215, + "Solid wall insulation": 131, "LEDs": 35, # per light "Flat Roof Insulation": 195, "Double Glazing": 1140, @@ -71,149 +71,10 @@ def app(): "Ground Floor Flat": 10 } - # If we have a flat, we won't use the 199m2 floor area - floor_areas = [73, 97, 199] - # We remove age bracket, as we ended up with 360 combinations - # age_brackets = ["1945-1970", "1971-2002", "Post 2002"] - wall_type = ["cavity", "non-cavity"] - roof_type = ["pitched", "other"] - planning_constraints = [True, False] - - # This is the list of all combinations of the above variables - combinations_untrimmed = product( - *[ - dwelling_types, floor_areas, wall_type, roof_type, planning_constraints - ] - ) - - # TODO: Possibly need to add an additional cost for immersion hot water - combinations = [] - for comb in combinations_untrimmed: - if "Flat" in comb[0] and comb[1] == 199: - continue - - # If we have a flat, not too much difference if it's in a conservation area or not - if "Flat" in comb[0] and comb[4] is True: - continue - combinations.append(comb) - - risk_matrix = [] - for combination in combinations: - n_floors = num_floors_map[combination[0]] - bf = built_form_map[combination[0]] - pt = "House" if "Flat" not in combination[0] else "Flat" - # Model the home as a box - ground_floor_area = combination[1] / n_floors - perimeter = np.sqrt(ground_floor_area) * 4 - - # This is the amount of insulation required - external_wall_area = estimate_external_wall_area( - num_floors=n_floors, - floor_height=2.5, - perimeter=perimeter, - built_form=bf - ) - - n_rooms = np.floor(combination[1] / 15) - - n_windows = 
estimate_windows( - property_type=pt, - built_form=bf, - construction_age_band="", - floor_area=combination[1], - number_habitable_rooms=n_rooms - ) - - # We determine the exact upgrade pathway for this combination, guided by the generic upgrade pathway - combination_upgrade_pathway = [] - for upgrade in upgrade_path: - if upgrade == "wall_insulation": - if combination[2] == "cavity": - combination_upgrade_pathway.append("cavity_wall_insulation") - else: - combination_upgrade_pathway.append("solid_wall_insulation") - continue - - if upgrade == "roof_insulation": - if combination[3] == "pitched": - combination_upgrade_pathway.append("loft_insulation") - else: - combination_upgrade_pathway.append("non_pitched_roof_insualtion") - continue - - if upgrade == "ventilation": - combination_upgrade_pathway.append("ventilation") - continue - - if upgrade == "low_energy_lighting": - combination_upgrade_pathway.append("low_energy_lighting") - continue - - if upgrade == "windows": - if not combination[4]: - combination_upgrade_pathway.append("double_glazing") - else: - combination_upgrade_pathway.append("secondary_glazing") - continue - - if upgrade == "heating": - if combination[0] in ["Semi Detached House", "Detached House"]: - combination_upgrade_pathway.append("high_heat_retention_storage") - else: - combination_upgrade_pathway.append("air_source_heat_pump") - continue - - if upgrade == "solar": - if combination[0] in ["Semi Detached House", "Detached House", "Mid Terrace House"]: - combination_upgrade_pathway.append("solar_pv") - continue - - combination_costs = [] - for measure in combination_upgrade_pathway: - unit_cost = pricing_matrix[measure] - # Wall insulation - if measure in ["cavity_wall_insulation", "internal_wall_insulation", "external_wall_insulation"]: - cost = unit_cost * external_wall_area - elif measure in ["loft_insulation"]: - cost = unit_cost * ground_floor_area - elif measure == "ventilation": - if combination[1] == 73: - cost = unit_cost * 2 - elif 
combination[1] == 97: - cost = unit_cost * 3 - else: - cost = unit_cost * 4 - elif measure == "low_energy_lighting": - n_lights = lighting_count[combination[0]] - if combination[1] == 73: - inflation = 1 - elif combination[1] == 97: - inflation = 1.2 - else: - inflation = 1.5 - cost = unit_cost * n_lights * inflation - elif measure in ["double_glazing", "secondary_glazing"]: - cost = unit_cost * n_windows - elif measure == "high_heat_retention_storage": - cost = unit_cost * n_rooms - elif measure in ["air_source_heat_pump", "solar_pv"]: - cost = unit_cost - else: - raise NotImplementedError("Implement: %s" % measure) - - combination_costs.append( - { - "measure": measure, - "cost": cost - } - ) - - combination_costs = pd.DataFrame(combination_costs) - contingency = 0.26 epr_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Risk Matrix/EPR Data.xlsx", header=1 + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Risk Matrix/EPR Data V2.xlsx", header=1 ) epr_data["Measure added"].value_counts() epr_data["row_id"] = epr_data.index @@ -318,6 +179,6 @@ def app(): ) with pd.ExcelWriter( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Risk Matrix/risk_matrix.xlsx") as writer: + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Risk Matrix/risk_matrix V2.xlsx") as writer: risk_matrix.to_excel(writer, sheet_name="Risk Matrix", index=False) pricing_df.to_excel(writer, sheet_name="Pricing Assumptions", index=False) diff --git a/etl/customers/mhs/prepare_data.py b/etl/customers/mhs/prepare_data.py new file mode 100644 index 00000000..1b47fefb --- /dev/null +++ b/etl/customers/mhs/prepare_data.py @@ -0,0 +1,60 @@ +""" +The data held on file for MHS is fairly incomplete, where not every single property has an observation +""" +from tqdm import tqdm +import pandas as pd +from docutils.utils.math.tex2mathml_extern import blahtexml + +asset_list = pd.read_excel( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for analysis.xlsx", + sheet_name="Data" +) +# When this list was checked, an observation was made per postcode, and so we need to extrapolate those findings +inspections_observatons = asset_list[["UPRN", "Postcode", "ManagementGroup", "WFT Findings"]].copy() + +populated = [] +for _, group in tqdm(inspections_observatons.groupby("Postcode"), + total=len(inspections_observatons.groupby("Postcode"))): + + if all(pd.isnull(group["WFT Findings"])): + group["WFT Findings"] = "Property not inspected" + populated.append(group) + continue + + fill_observation = group["WFT Findings"].values[0] + if pd.isnull(fill_observation): + group["WFT Findings"] = group["WFT Findings"].fillna("Property not inspected") + populated.append(group) + continue + + group = group.reset_index(drop=True) + + group_filled = [] + for idx, x in group.iterrows(): + if idx == 0: + group_filled.append(x) + continue + + new_value = x["WFT Findings"] + if not pd.isnull(new_value): + fill_observation = new_value + + x["WFT Findings"] = fill_observation + group_filled.append(x) + + group_filled = pd.DataFrame(group_filled) + + populated.append(group_filled) + +populated = pd.concat(populated) + +missed = populated[~populated["UPRN"].isin(asset_list["UPRN"].values)] + +asset_list = asset_list.drop(columns=["WFT Findings"]).merge( + populated.drop(columns=["Postcode", "ManagementGroup"]), how="left", on="UPRN" +) + +# Store the data +asset_list.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for programme build.xlsx" +) diff --git a/etl/customers/unitas/preparing_programme_rebuild.py b/etl/customers/unitas/preparing_programme_rebuild.py new file mode 100644 index 00000000..f4e5642a --- /dev/null +++ b/etl/customers/unitas/preparing_programme_rebuild.py @@ -0,0 +1,28 @@ +""" +Simple script to tidy up the unitas asset list +""" +import pandas as pd + +df = 
pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/UNITAS - Asset List.xlsx", + sheet_name="Asset List" +) +df["Warmfront Finding"] = df["Warmfront Finding"].str.lower().str.strip() + +mapping = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/unitas-mapped-categories.csv", +) + +al = df.merge( + mapping[["non-intrusives: WFT Findings", 'mapped_category']].rename( + columns={"mapped_category": "WFT Findings"} + ), + how="left", + left_on="Warmfront Finding", + right_on="non-intrusives: WFT Findings" +) + +al.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/unitas_asset_list_for_analysis.xlsx", + index=False +) From 30847ded90ef5c7b7442da62a5b60bd5d74833f7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 13 May 2025 11:53:38 +0100 Subject: [PATCH 6/6] debugging incorrect fetching of flat data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 12 +- asset_list/app.py | 541 ++++--------------------- asset_list/mappings/built_form.py | 69 +++- asset_list/mappings/heating_systems.py | 5 +- asset_list/mappings/property_type.py | 28 +- backend/SearchEpc.py | 33 +- etl/customers/mhs/flag_pilot.py | 134 ++++++ 9 files changed, 355 insertions(+), 471 deletions(-) create mode 100644 etl/customers/mhs/flag_pilot.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 94c3c235..b7dd8d70 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -739,6 +739,11 @@ class AssetList: self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP) ) + no_data_codes = {"No Data": None} + self.standardised_asset_list[self.landlord_year_built] 
= ( + self.standardised_asset_list[self.landlord_year_built].replace(no_data_codes) + ) + self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime( self.standardised_asset_list[self.landlord_year_built] ) @@ -759,7 +764,8 @@ class AssetList: "This cell has an external reference that can't be shown or edited. Editing this cell will " "remove the external reference.", "ND", - 'PIMSS EMPTY' + 'PIMSS EMPTY', + "UNKNOWN" ] if pd.isnull(date_str) or date_str in known_errors or (date_str == 0): @@ -1229,11 +1235,11 @@ class AssetList: elif self.old_format_non_intrusives_present: non_intrusives_wall_filter = ( self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( - ["empty cavity", "partial fill"] + ["empty cavity", "partial fill", "empty", "EMPTY CAVITY 70MM", "partial"] ) | ( ( self.standardised_asset_list['non-intrusives: WFT Findings'] - .str.lower().str.strip().str.contains("empty cavity|partial fill") & + .str.lower().str.strip().str.contains("empty cavity|partial fill|empty|partial") & ~self.standardised_asset_list['non-intrusives: WFT Findings'] .astype(str).str.lower().str.strip().str.contains("major access issues") ) diff --git a/asset_list/app.py b/asset_list/app.py index be2ef031..d5ce7226 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -62,22 +62,83 @@ def app(): Property UPRN """ - # Community Housing new list - data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme " - "Reconciliation") - data_filename = "SUB EPC C to Domna.xlsx" - sheet_name = "Sheet1" - postcode_column = 'POSTCODE' - fulladdress_column = None - address1_column = "ADDRESS" - address1_method = None - address_cols_to_concat = ["ADDRESS", "ESTATE", "TOWN"] + # Thurrock + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thurrock" + data_filename = "THURROCK COUNCIL.xlsx" + sheet_name = "Assets" + postcode_column = 'Postcode' + fulladdress_column = "Full Address" + 
address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "BUILD DATE" + landlord_year_built = "Construction Date" landlord_os_uprn = None - landlord_property_type = "PROPERTY TYPE" - landlord_built_form = "PROPERTY TYPE" - landlord_wall_construction = "CONSTRUCTION TYPE" + landlord_property_type = "Property Type" + landlord_built_form = "Property Subtype" + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = "Main Heating Type" + landlord_existing_pv = None + landlord_property_id = "Property Reference" + landlord_sap = None + outcomes_filename = [] + outcomes_sheetname = [] + outcomes_postcode = [] + outcomes_houseno = [] + outcomes_id = [] + outcomes_address = [] + master_filepaths = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + + # Medway + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Medway" + data_filename = "MEDWAY Asset List.xlsx" + sheet_name = "Asset list" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "House Number" + address1_method = None + address_cols_to_concat = ["House Number", "Street 1"] + missing_postcodes_method = None + landlord_year_built = "Year Built" + landlord_os_uprn = None + landlord_property_type = "Property Type - Academy" + landlord_built_form = "Property Type - Academy" + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Row ID" + landlord_sap = None + outcomes_filename = [] + outcomes_sheetname = [] + outcomes_postcode = [] + outcomes_houseno = [] + outcomes_id = [] + outcomes_address = [] + master_filepaths = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + + # MHS + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS" + 
data_filename = "MHS HOMES (Full Asset List) - for programme build.xlsx" + sheet_name = "Sheet1" + postcode_column = 'Postcode' + fulladdress_column = "FullAddress" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "BuiltInYear" + landlord_os_uprn = None + landlord_property_type = "AssetType" + landlord_built_form = "PropertyType" + landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None @@ -94,459 +155,33 @@ def app(): phase = False ecosurv_landlords = None - # Unitas - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas" - data_filename = "unitas_asset_list_for_analysis.xlsx" - sheet_name = "Sheet1" - postcode_column = 'Post Code' - fulladdress_column = "Address Line 1" - address1_column = "Address Line 1" - address1_method = None - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "built year" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_built_form = "Expanded Property Type" - landlord_wall_construction = None - landlord_roof_construction = "loft insulation" - landlord_heating_system = "Bolier Make" - landlord_existing_pv = None - landlord_property_id = "Property Reference" - landlord_sap = "Sap Rating" - outcomes_filename = [ - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/Unitas - All outcomes - 24.04.2025.xlsx", - ] - outcomes_sheetname = ["Feedback"] - outcomes_postcode = ["Postcode"] - outcomes_houseno = ["No."] - outcomes_id = [None] - outcomes_address = ["Address"] - master_filepaths = [ - os.path.join(data_folder, "Submissions ECO 3.csv"), - os.path.join(data_folder, "Submissions ECO 4 - PHASE 1.csv"), - os.path.join(data_folder, "Submissions ECO 4 - PHASE 2.csv") - ] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = 
"unitas|everill|baskeyfield" - - # LHP: - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP" - data_filename = "LHP.xlsx" - sheet_name = "Decent Homes Stock" - postcode_column = 'Postcode' - fulladdress_column = "Address" - address1_column = None - address1_method = "house_number_extraction" - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Build Date" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = "Heating Type" - landlord_existing_pv = None - landlord_property_id = "Property ID" - landlord_sap = None - outcomes_filename = [ - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP/LHP Outcomes.xlsx", - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP/Lincolnshire Housing Partnership - Outcomes 20th " - "Feb 2024.xlsx", - ] - outcomes_sheetname = ["Sheet1", "LHP"] - outcomes_postcode = ["Postcode", "Postcode"] - outcomes_houseno = ["No.", "No."] - outcomes_id = [None, None] - outcomes_address = ["Address", "Address"] - master_filepaths = [os.path.join(data_folder, "LHP Rolling Master for analysis.csv")] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = "lhp" - - # Soverign - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Sovereign" - data_filename = "Warmfront - Quote for CWI.xlsx" - sheet_name = "Sheet2" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "Address Line 1" - address1_method = None - address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = None - landlord_existing_pv = None - 
landlord_property_id = "ID" - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - - # NCHA - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA" - data_filename = "Energy Info Copy.xlsx" - sheet_name = "Data" - postcode_column = 'Postcode' - fulladdress_column = "Address" - address1_column = None - address1_method = "house_number_extraction" - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Build Date (HAR10)" - landlord_os_uprn = None - landlord_property_type = "Property Type (HAR10)" - landlord_built_form = "Build Form (EPC)" - landlord_wall_construction = "Wall Description" - landlord_roof_construction = None - landlord_heating_system = "Heating System" - landlord_existing_pv = None - landlord_property_id = "Place ref" - landlord_sap = "EPC SAP" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - - # Torus - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1" - data_filename = "Torus Property Asset List - Phase 1.xlsx" - sheet_name = "TORUS" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "AddressLine1" - address1_method = None - address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] - missing_postcodes_method = None - landlord_year_built = "Property Age" - landlord_os_uprn = "NatUPRN" - landlord_property_type = "Property Type" - landlord_built_form = "Built Form" - landlord_wall_construction = "Wall Construction" - landlord_roof_construction = "Roof Construction" - 
landlord_heating_system = "Space Heating Source" - landlord_existing_pv = "Low Carbon Technology (Solar PV)" - landlord_property_id = "UPRN" - landlord_sap = "SAP Score" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_to_asset_list_filepath = None - phase = True - # Southern Midlands - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025" - data_filename = "Southern Housing Midlands Property List - combined.xlsx" - sheet_name = "Sheet 1" - postcode_column = 'Post Code' - fulladdress_column = "Address" - address1_column = None - address1_method = "house_number_extraction" - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Age_1" - landlord_os_uprn = None - landlord_property_type = "Prop_Type" - landlord_built_form = "Prop_Type" - landlord_wall_construction = "Walls_P" - landlord_heating_system = "Heating System" - landlord_existing_pv = None - landlord_property_id = "AssetID" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # PFP London - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/London" - data_filename = "PFP AREAS SURROUNDING LONDON - JAY, RUTH & LANE.xlsx" - sheet_name = "PFP SURROUNDING LONDON" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "AddressLine1" - address1_method = None - address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "Archetype (PFP)" - landlord_built_form = "Archetype (PFP)" - landlord_wall_construction = None - 
landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Uprn" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # PFP North-West - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-West" - data_filename = "Places for People NORTH WEST - INSPECTIONS MASTER - UPDATE.xlsx" - sheet_name = "CHECKED" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "AddressLine1" - address1_method = None - address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "Archetype (PFP)" - landlord_built_form = "Archetype (PFP)" - landlord_wall_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Uprn" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # PFP North-East - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-East" - data_filename = "Places for People NORTH EAST - INSPECTIONS MASTER.xlsx" - sheet_name = "CHECKED" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "AddressLine1" - address1_method = None - address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "Archetype (PFP)" - landlord_built_form = "Archetype (PFP)" - landlord_wall_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Uprn" - outcomes_filename = None - outcomes_sheetname = None - 
outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # PFP East - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" - data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" - sheet_name = "PFP EAST" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "AddressLine1" - address1_method = None - address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "Archetype (PFP)" - landlord_built_form = "Archetype (PFP)" - landlord_wall_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Uprn" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # Wates - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - " - data_filename = "ECO 4 Wates.xlsx" - sheet_name = "Roadmap Homes" - postcode_column = 'Postcode' - fulladdress_column = None - address1_column = "Address Line 1" - address1_method = None - address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"] - missing_postcodes_method = None - landlord_year_built = "Build Year" - landlord_os_uprn = None - landlord_property_type = "Archetype" - landlord_built_form = "Archetype" - landlord_wall_construction = "Wall" - landlord_heating_system = "Heating Type" - landlord_existing_pv = None - landlord_property_id = "UPRN" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - master_filepaths = [] - master_to_asset_list_filepath = None - - # Ealing - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 
04032025" - # data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" - # sheet_name = "IGNORE - FULL MAIN" - # postcode_column = 'Postcode' + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025" + # data_filename = "Southern Housing Midlands Property List - combined.xlsx" + # sheet_name = "Sheet 1" + # postcode_column = 'Post Code' # fulladdress_column = "Address" # address1_column = None - # address1_method = "first_word" + # address1_method = "house_number_extraction" # address_cols_to_concat = [] # missing_postcodes_method = None - # landlord_year_built = "Year Built" + # landlord_year_built = "Age_1" # landlord_os_uprn = None - # landlord_property_type = "Property Type Code" - # landlord_wall_construction = None - # landlord_heating_system = None + # landlord_property_type = "Prop_Type" + # landlord_built_form = "Prop_Type" + # landlord_wall_construction = "Walls_P" + # landlord_heating_system = "Heating System" # landlord_existing_pv = None - # landlord_property_id = "Property ref" - - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Full Address.1' - # fulladdress_column = "Full Address" - # address1_column = None - # address1_method = "first_word" - # address_cols_to_concat = [] - # missing_postcodes_method = None - # landlord_year_built = "Build Date" - # landlord_os_uprn = None - # landlord_property_type = "Property Type" - # landlord_wall_construction = "Wallinsul" - # landlord_heating_system = "HeatSorc" - # landlord_existing_pv = None - # landlord_property_id = "Property Reference" + # landlord_property_id = "AssetID" # outcomes_filename = None # outcomes_sheetname = None # outcomes_postcode = None # outcomes_houseno = None + # outcomes_id = None + # outcomes_address = None # master_filepaths = [] # 
master_to_asset_list_filepath = None - # For Westward - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - data_filename = "WESTWARD - completed list - 20.03.2025.xlsx" - sheet_name = "Sheet1" - postcode_column = "WFT EDIT Postcode" - fulladdress_column = "Address" - address1_column = None - address1_method = "house_number_extraction" - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Build date" - landlord_os_uprn = "UPRN" - landlord_property_type = "Location type" - landlord_built_form = None - landlord_wall_construction = "Wall Construction (EPC)" - landlord_heating_system = "Heat Source" - landlord_existing_pv = "PV (Y/N)" - landlord_property_id = "Place ref" - landlord_roof_construction = None - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - master_filepaths = [] - master_to_asset_list_filepath = None - outcomes_id = None - outcomes_address = None - phase = False - ecosurv_landlords = None - - # For ACIS - programme re-build - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" - # data_filename = "ACIS asset list.xlsx" - # sheet_name = "Assets" - # address1_column = "House No" - # postcode_column = "Postcode" - # landlord_property_id = "UPRN" - # fulladdress_column = None - # address_cols_to_concat = ["House No", "Street", "Town"] - # missing_postcodes_method = None - # address1_method = None - # landlord_year_built = "YEAR BUILT" - # landlord_os_uprn = None - # landlord_property_type = "Property type" - # landlord_built_form = None - # landlord_wall_construction = "Wall Constuction" - # landlord_roof_construction = None - # landlord_sap = None - # landlord_heating_system = "Heating" - # landlord_existing_pv = None - # outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" - # outcomes_sheetname = "Feedback" - # outcomes_postcode = 
"Postcode" - # outcomes_address = "Address" - # outcomes_houseno = "No" - # outcomes_id = None - # master_filepaths = [ - # os.path.join(data_folder, "ECO 3 -Table 1.csv"), - # os.path.join(data_folder, "ECO 4 -Table 1.csv"), - # ] - # master_to_asset_list_filepath = None - # phase = False - # ecosurv_landlords = None - - # For plus dane - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane" - data_filename = "PLUS DANE Asset List - for analysis.xlsx" - sheet_name = "Asset List" - address1_column = " Address" - postcode_column = " Postcode" - landlord_property_id = "UPRN" - fulladdress_column = " Address" - address_cols_to_concat = [] - missing_postcodes_method = None - address1_method = None - landlord_year_built = "Property Age" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_wall_construction = "Landlord Wall Full" - landlord_heating_system = "Landlord Heating" - landlord_existing_pv = None - outcomes_filename = "plus dane outcomes.xlsx" - outcomes_sheetname = "EVERYTHING" - outcomes_postcode = "Post Code" - outcomes_houseno = "Numb." 
- master_filepaths = [ - os.path.join(data_folder, "JJC Rolling Master.csv"), - os.path.join(data_folder, "SCIS Rolling Master.csv"), - ] - master_to_asset_list_filepath = os.path.join(data_folder, "surveys_to_assets.csv") - # Maps addresses to uprn in problematic cases manual_uprn_map = {} diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index ffd698b3..116c3203 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -5,7 +5,7 @@ STANDARD_BUILT_FORMS = { # Houses "end-terrace", "semi-detached", "detached", "mid-terrace", # Flats - "ground floor", "mid-floor", "top-floor", "basement" + "ground floor", "mid-floor", "top-floor", "basement", "low rise", "high rise", } BUILT_FORM_MAPPINGS = { @@ -265,5 +265,70 @@ BUILT_FORM_MAPPINGS = { 'FIRST FLOOR FLAT': 'ground floor', 'GROUND FL MAISONETTE': 'ground floor', 'HOUSE 2 LIVING ROOMS': 'unknown', - 'FLAT OVER SHOP': 'unknown' + 'FLAT OVER SHOP': 'unknown', + + '4 Bed Detached House': 'detached', + '2 Bed Detached House': 'detached', + '3 Bed Detached Bungalow': 'detached', + '1 Bed Semi-Detached House': 'semi-detached', + '2 Bed Semi-Detached House': 'semi-detached', + '2 Bed Detached Bungalow': 'detached', + '1 Bed Mid Terrace Bungalow': 'mid-terrace', + '4 Bed Semi-Detached Bungalow': 'semi-detached', + '3 Bed Mid Terrace Bungalow': 'mid-terrace', + '3 Bed Semi-Detached Bungalow': 'semi-detached', + '3 Bed Mid Terrace House': 'mid-terrace', + '2 Bed Mid Terrace House': 'mid-terrace', + '3 Bed Detached House': 'detached', + '2 Bed Semi-Detached Bungalow': 'semi-detached', + '5 Bed Mid Terrace House': 'mid-terrace', + '2 Bed Mid Terrace Bungalow': 'mid-terrace', + '3 Bed Semi-Detached House': 'semi-detached', + '1 Bed Semi-Detached Bungalow': 'semi-detached', + '4 Bed Mid Terrace House': 'mid-terrace', + '1 Bed Detached Bungalow': 'detached', + '5 Bed Semi-Detached House': 'semi-detached', + '6 Bed Detached House': 'detached', + '1 Bed Mid Terrace 
House': 'mid-terrace', + '4 Bed Semi-Detached House': 'semi-detached', + 'TBA': 'unknown', + '1 Bed EOT House': 'end-terrace', + '3 Bed Flat': 'unknown', + '5 Bed EOT House': 'end-terrace', + '1 Bed EOT Bungalow': 'end-terrace', + '2 Bed EOT House': 'end-terrace', + '1 Bed Studio Flat': 'unknown', + '3 Bed Maison': 'unknown', + 'Commercial Letting': 'unknown', + '4 Bed Maison': 'unknown', + '2 Bed Flat': 'unknown', + '3 Bed EOT House': 'end-terrace', + '2 Bed Maison': 'unknown', + '4 Bed EOT House': 'end-terrace', + '1 Bed Flat': 'unknown', + '3 Bed EOT Bungalow': 'end-terrace', + '1 Bed Maison': 'unknown', + '2 Bed EOT Bungalow': 'end-terrace', + + 'Bungalow detached': 'detached', + 'Bungalow semi detached': 'semi-detached', + 'Sheltered bungalow semi detached': 'semi-detached', + 'Bedsit bungalow semi detached': 'semi-detached', + 'Semi detached house': 'semi-detached', + 'Bedsit bungalow terraced': 'mid-terrace', 'Terraced house': 'mid-terrace', + 'Sheltered flat': 'unknown', + 'APD Bungalow': 'unknown', + 'Flat with partition': 'unknown', + 'APD flat': 'unknown', + 'Sheltered warden flat': 'unknown', + 'Sheltered bedsit': 'unknown', + 'Sheltered bungalow terraced': 'mid-terrace', + 'Block': 'unknown', + 'Bungalow terraced': 'mid-terrace', + 'Maisonette flat': 'unknown', + 'Sheltered bedsit disabled': 'unknown', + 'Bedsit Flat': 'unknown', + 'Low Rise': 'low rise', + 'Upper Floor': 'top-floor', + 'High Rise': 'high rise', } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 463e2cef..92f59f2c 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -288,5 +288,8 @@ HEATING_MAPPINGS = { 'No Gas Boiler': 'no heating', 'Back Boiler': 'solid fuel', "This cell has an external reference that can't be shown or edited. 
Editing this cell will remove the external " - "reference.": 'unknown' + "reference.": 'unknown', + 'Communal Heating': 'communal heating', + 'No Data': 'unknown', + 'Boiler System': 'gas condensing boiler', } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index d455d312..b705d6ef 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -227,6 +227,30 @@ PROPERTY_MAPPING = { 'FIRST FLOOR FLAT': 'flat', 'GROUND FL MAISONETTE': 'maisonette', 'HOUSE 2 LIVING ROOMS': 'house', - 'FLAT OVER SHOP': 'flat' - + 'FLAT OVER SHOP': 'flat', + 'House With Integral Garage': 'house', + 'Flat Over Parking/Accessway': 'flat', + 'Flat Over Binstore': 'flat', + 'Flat Over Garage': 'flat', + 'House With Independent Garage': 'house', + 'Studio': 'flat', + 'Bedsit bungalow terraced': 'bedsit', + 'Terraced house': 'house', + 'Sheltered flat': 'flat', + 'APD Bungalow': 'bungalow', + 'Flat with partition': 'flat', + 'Bungalow detached': 'bungalow', + 'APD flat': 'flat', + 'Sheltered warden flat': 'flat', + 'Bungalow semi detached': 'bungalow', + 'Sheltered bedsit': 'bedsit', + 'Sheltered bungalow terraced': 'bungalow', + 'Sheltered bungalow semi detached': 'bungalow', + 'Bungalow terraced': 'bungalow', + 'Maisonette flat': 'maisonette', + 'Sheltered bedsit disabled': 'bedsit', + 'Bedsit bungalow semi detached': 'bedsit', + 'Bedsit Flat': 'bedsit', + 'Semi detached house': 'house', + 'Unit': 'unknown' } diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 96b7c5de..e19a776d 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -172,7 +172,7 @@ class SearchEpc: self.address1 = address1 self.postcode = postcode - self.full_address = full_address + self.full_address = full_address if full_address is not None else self.address1 self.uprn = uprn self.house_number = self.get_house_number(self.address1) self.numeric_house_number = self.extract_numeric_housenumber_part(self.house_number) @@ -265,9 
+265,7 @@ class SearchEpc: for retry in range(self.max_retries): try: - response = self.client.domestic.call(method="get", url=url, params=params) - if response: self.data = response return { @@ -368,8 +366,11 @@ class SearchEpc: unique_property_types = {r["property-type"] for r in rows} # We allow for variation in property type across flats/maisonettes - if (len(uprns) == 1) and ((len(unique_property_types) == 1) or unique_property_types == {"Flat", "Maisonette"}): - return rows + # If we know that we have a flat/maisonette, we allow for both property types + if property_type in ["Flat", "Maisonette"]: + if ((len(uprns) == 1) and ((len(unique_property_types) == 1) + ) or unique_property_types == {"Flat", "Maisonette"}): + return rows if property_type is not None: # We can do a filter on the property type @@ -388,11 +389,27 @@ class SearchEpc: # We check if post town is included in the address if any([r["posttown"].lower() in address.lower() for r in rows]): - best_match = process.extractOne( + best_match1 = process.extractOne( address, [", ".join([r["address"], r["posttown"]]) for r in rows], score_cutoff=0 ) - # Get all of the scores - rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match[0]] + best_match2 = process.extractOne( + address, [", ".join([r["address"]]) for r in rows], score_cutoff=0 + ) + # Pick the largest score + if best_match1[1] >= best_match2[1]: + # Get all of the scores + rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match1[0]] + else: + # Get all of the scores + rows_filtered = [r for r in rows if r["address"] == best_match2[0]] + + # If we have multiple, we filter on newest lodgment date + if len(rows_filtered) > 1: + rows_filtered = [ + r for r in rows_filtered + if r["lodgement-datetime"] == max([x["lodgement-datetime"] for x in rows_filtered]) + ] + else: best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) # Get the UPRN for the 
best match
diff --git a/etl/customers/mhs/flag_pilot.py b/etl/customers/mhs/flag_pilot.py
new file mode 100644
index 00000000..f96f965d
--- /dev/null
+++ b/etl/customers/mhs/flag_pilot.py
@@ -0,0 +1,134 @@
+"""
+Flag pilot and phase 1 properties on the standardised asset list and attach the CIGA check guarantee results.
+"""
+import pandas as pd
+import os
+import numpy as np
+from tqdm import tqdm
+
+PILOT_PROJECT_CODE = "MHS-000-PILOT"
+MHS_PHASE_1_PROJECT_CODE = "MHS-001"
+
+asset_list = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for programme build - "
+    "Standardised.xlsx",
+    sheet_name="Standardised Asset List",
+)
+flat_data = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for programme build - "
+    "Standardised.xlsx",
+    sheet_name="Flat Data",
+)
+
+pilot = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS 334 x Pilot reviewed - KB notes end column.xlsx"
+)
+ciga_checks = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS CIGA Check 03042025_201.xlsx"
+)
+ciga_checks["row_id"] = ciga_checks.index
+
+asset_list["project_code"] = None
+
+asset_list["project_code"] = np.where(
+    asset_list["landlord_property_id"].isin(pilot["Place Reference"]),
+    PILOT_PROJECT_CODE,
+    asset_list["project_code"],
+)
+# Flag the next phase: properties without a project code that have a cavity or solar reason recorded
+asset_list["project_code"] = np.where(
+    (~pd.isnull(asset_list["cavity_reason"]) | ~pd.isnull(asset_list["solar_reason"])) & pd.isnull(
+        asset_list["project_code"]),
+    MHS_PHASE_1_PROJECT_CODE,
+    asset_list["project_code"],
+)
+
+# Match each CIGA check row to one asset; manual_fixes maps a matched address straight to a landlord property id
+manual_fixes = {
+    "123 Columbine Close, Rochester": "2213861230"
+}
+ciga_lookup = []
+for _, row in tqdm(ciga_checks.iterrows(), total=len(ciga_checks)):
+    # Manual override: look the property up directly by landlord property id
+    if manual_fixes.get(row["Matched Address"]):
+        ll_pid = manual_fixes[row["Matched Address"]]
+        df = asset_list[
+            (asset_list["landlord_property_id"].astype(str) == ll_pid)
+        ]
+        ciga_lookup.append(
+            {
+                "domna_property_id": df["domna_property_id"].values[0],
+                "row_id": row["row_id"],
+            }
+        )
+        continue
+
+    df = asset_list[
+        (asset_list["domna_postcode"] == row["Postcode"])
+    ]
+
+    df = df[
+        (df["domna_address_1"].astype(str) == str(row["Address1"]))
+    ]
+    # No match on the raw postcode: retry with the matched postcode
+    if df.empty:
+        df = asset_list[
+            (asset_list["domna_postcode"] == row["Matched Postcode"])
+        ]
+        df = df[(df["domna_address_1"].astype(str) == str(row["Address1"]))]
+    # Several candidates: narrow down on the full address (literal substring match, commas ignored)
+    if df.shape[0] > 1:
+        df = asset_list[
+            (asset_list["domna_full_address"].str.lower().str.replace(",", "").str.contains(
+                row["Matched Address"].lower().replace(",", ""), na=False, regex=False))
+        ]
+        if df.empty:
+            df = asset_list[
+                (asset_list["domna_full_address"].str.lower().str.replace(",", "").str.contains(
+                    row["Address2"].lower().replace(",", ""), na=False, regex=False))
+            ]
+
+        df = df[(df["domna_address_1"].astype(str) == str(row["Address1"]))]
+    # Anything other than exactly one remaining candidate (zero or several) cannot be matched safely
+    if df.shape[0] != 1:
+        raise ValueError(f"Expected exactly one match for {row['Address1']} in the asset list, found {df.shape[0]}")
+
+    ciga_lookup.append(
+        {
+            "domna_property_id": df["domna_property_id"].values[0],
+            "row_id": row["row_id"],
+        }
+    )
+
+ciga_lookup = pd.DataFrame(ciga_lookup)
+
+ciga_lookup = ciga_lookup.merge(
+    ciga_checks[["row_id", "Guarantee"]].rename(
+        columns={"Guarantee": "ciga_guarantee"}
+    ), how="left", on="row_id"
+)
+ciga_lookup["ciga_check_complete"] = True
+# NOTE(review): ciga_check_complete is not among the columns merged into asset_list below - confirm intended
+asset_list = asset_list.merge(
+    ciga_lookup[["domna_property_id", "ciga_guarantee"]],
+    how="left",
+    on="domna_property_id"
+)
+
+# Check we matched addresses correctly
+# match_check = ciga_lookup.merge(
+#     ciga_checks, how="left", on="row_id"
+# ).merge(
+#     asset_list[["domna_property_id", "domna_full_address"]], how="left", on="domna_property_id"
+# )
+# match_check = match_check[["Matched Address", "domna_full_address"]]
+
+# Save
+
+filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/12052025 MHS Standardised Asset List - "
+            "programme.xlsx")
+# Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
+
+with pd.ExcelWriter(filename) as writer:
+    asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
+    flat_data.to_excel(writer, sheet_name="Flat Data", index=False)