From 5848cb5314a991bf5cb2a48b0e0d2c915dc01446 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 7 May 2025 17:43:14 +0100 Subject: [PATCH] working on unitas data standardisation --- asset_list/AssetList.py | 77 +++++++++++++++++--------- asset_list/app.py | 62 ++++++++++++--------- asset_list/mappings/built_form.py | 40 ++++++++++++- asset_list/mappings/heating_systems.py | 15 ++++- asset_list/mappings/property_type.py | 15 ++++- asset_list/mappings/roof.py | 16 +++++- 6 files changed, 168 insertions(+), 57 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4586ae57..dcc7290a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -5,6 +5,7 @@ import tiktoken from pprint import pprint from datetime import datetime +from numpy.ma.core import masked_not_equal from openai import OpenAI import numpy as np import pandas as pd @@ -2179,7 +2180,7 @@ class AssetList: return # TODO: Fetch from Sharepoint - ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/15.04.csv" + ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/07.05.2025.csv" logger.info("Getting Ecosurv data from %s", ecosurv_filepath) self.ecosurv = pd.read_csv(ecosurv_filepath, encoding="cp437") @@ -2310,7 +2311,7 @@ class AssetList: nomatch_i = [] for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)): - if pd.isnull(x[outcomes_address[idx]]): + if pd.isnull(x[outcomes_address[idx]]) or not x[outcomes_address[idx]]: continue # Check if we have an id @@ -2448,7 +2449,7 @@ class AssetList: return pd.to_datetime(match.group(1), format="%d.%m.%Y", errors="coerce") return pd.NaT - lookup['parsed_date'] = lookup['Date letters sent'].apply(extract_date) + lookup['parsed_date'] = lookup[date_col].apply(extract_date) def get_latest_note(group): surveyed = group[group['Outcome'] == 'surveyed'] @@ -2457,8 +2458,11 @@ class AssetList: else: return group.sort_values('parsed_date', ascending=False).iloc[0] - latest_note = lookup.groupby('domna_property_id', group_keys=False).apply(get_latest_note).reset_index( - drop=True) + latest_note = ( + lookup.groupby('domna_property_id', group_keys=False). + apply(get_latest_note). + reset_index(drop=True) + ) latest_note = latest_note[["domna_property_id", notes_col]] pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index() @@ -2513,36 +2517,43 @@ class AssetList: # Strip columns master_data.columns = [c.strip() for c in master_data.columns] master_data.columns = [re.sub(r'\s+', ' ', c) for c in master_data.columns] + # Drop any unnamed columns + unnamed_columns = [c for c in master_data.columns if "Unnamed:" in c] + master_data = master_data.drop(columns=unnamed_columns) if not id_map.empty: master_data = master_data.merge( id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code'] ) - install_col = ( - "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns - else "INSTALL / CANCELLATION DATE" - ) + if "INSTALLED OR CANCELLED" in master_data.columns: + install_col = "INSTALLED OR CANCELLED" + elif "INSTALL / CANCELLATION DATE" in master_data.columns: + install_col = "INSTALL / CANCELLATION DATE" + elif 'INSTALL/ CANCELLATION DATE' in master_data.columns: + install_col = 'INSTALL/ CANCELLATION DATE' + else: + raise ValueError("No install or cancellation date") submission_col = ( "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS" ) - if "UPRN" in master_data.columns: - # We just need to check if any were cancelled - master_to_append = master_data[ - ["UPRN", install_col, submission_col] - ].rename( - columns={ - "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, - install_col: "survey_status", - submission_col: "submission_date" - } - ) - master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") - - master_surveyed.append(master_to_append) - continue + # if "UPRN" in master_data.columns: + # # We just need to check if any were cancelled + # master_to_append = master_data[ + # ["UPRN", install_col, submission_col] + # ].rename( + # columns={ + # "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, + # install_col: "survey_status", + # submission_col: "submission_date" + # } + # ) + # master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") + # + # master_surveyed.append(master_to_append) + # continue master_data["row_id"] = master_data.index @@ -2557,23 +2568,35 @@ class AssetList: house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO" # Otherwise, we need to match algorithmically + has_property_id = "UPRN" in master_data.columns logger.info("Matching master data to asset list") matched = [] unmatched = [] for _, row in tqdm(master_data.iterrows(), total=len(master_data)): + if pd.isnull(row[postcode_col]): continue + + # if has_property_id: + # submission_uprn = row["UPRN"] + # + # if not pd.isnull(submission_uprn): + # df = self.standardised_asset_list[ + # self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == submission_uprn + # ] + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() df = self.standardised_asset_list[ ( - self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ", - "") - == postcode_no_space + self.standardised_asset_list[self.STANDARD_POSTCODE] + .str.strip().str.lower().str.replace(" ", "") == postcode_no_space ) ] house_no = row[house_no_col] + if isinstance(house_no, float): + house_no = str(int(house_no)) if house_no in df["house_no"].values: df = df[df["house_no"] == house_no] diff --git a/asset_list/app.py b/asset_list/app.py index 14322a97..76e09295 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -62,32 +62,42 @@ def app(): Property UPRN """ - # TODO: - # For cavity work: - # - Flag any entries that have a different wall type between non-intrusive data against EPC - # - Worth double checking entries that have a difference in wall construction - # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity - # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation - # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats - # are less than C75 - # - Flag anything pre SAP2012 - # - Flag anything over 5 years old - # - Look at year built vs age band - # - # For Solar: - # - Discount any that have solar PV - based on non-intrusives and from the inspections team - # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with - # electric room heaters but it might need to be an EPC E - # - Fabric - check the floor, wall and roof: - # - Filled or empty cavity is good - # - Insulated solid/timber/system built is good - # - SCIS/CEG needs solid floors - # - JJC don’t care - # - Anything with a loft 200 or below - # - Anything C75 and above won’t qualify - # - Insulated loft = 200mm - # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) - # - Or the insulation required is loft/cavity (floors should be solid) + # Unitas + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas" + data_filename = "UNITAS - Asset List.xlsx" + sheet_name = "Asset List" + postcode_column = 'Post Code' + fulladdress_column = "Address Line 1" + address1_column = "Address Line 1" + address1_method = None + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "built year" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = "Expanded Property Type" + landlord_wall_construction = None + landlord_roof_construction = "loft insulation" + landlord_heating_system = "Bolier Make" + landlord_existing_pv = None + landlord_property_id = "Property Reference" + landlord_sap = "Sap Rating" + outcomes_filename = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/Unitas - All outcomes - 24.04.2025.xlsx", + ] + outcomes_sheetname = ["Feedback"] + outcomes_postcode = ["Postcode"] + outcomes_houseno = ["No."] + outcomes_id = [None] + outcomes_address = ["Address"] + master_filepaths = [ + os.path.join(data_folder, "Submissions ECO 3.csv"), + os.path.join(data_folder, "Submissions ECO 4 - PHASE 1.csv"), + os.path.join(data_folder, "Submissions ECO 4 - PHASE 2.csv") + ] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = "unitas|everill|baskeyfield" # LHP: data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP" diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 1d0aecf5..6ce31cdd 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -212,5 +212,43 @@ BUILT_FORM_MAPPINGS = { '5 Ext. Wall Flat': 'unknown', 'Unknown': 'unknown', 'Enclosed mid-terrace': 'mid-terrace', - 'Enclosed end-terrace': 'end-terrace' + 'Enclosed end-terrace': 'end-terrace', + + 'House GROUND FLOOR': 'ground floor', + 'Flat? GROUND FLOOR': 'ground floor', + 'House SD SEMI DETACHED': 'semi-detached', + 'House SEMI DETACHED': 'semi-detached', + 'Flat GROUND FLOOR': 'ground floor', + '': 'unknown', + 'Flat SEVENTH FLOOR': 'mid-floor', + 'House D': 'detached', + 'House ET': 'end-terrace', + 'House SD Homeless Unit': 'semi-detached', + 'House MT Homeless Unit': 'mid-terrace', + 'Bungalow ET': 'end-terrace', + 'Bungalow D': 'detached', + 'House SD': 'semi-detached', + 'Bungalow Sheltered Accomodation': 'unknown', + 'House. SD': 'semi-detached', + 'Flat FIRST FLOOR MAISONETTE': 'ground floor', + 'Bungalow SD': 'semi-detached', + 'Flat FIRST FLOOR': 'ground floor', + 'Flat Sheltered Accomodation': 'unknown', + 'Flat SIXTH FLOOR': 'mid-floor', + 'Flat EIGHTH FLOOR': 'mid-floor', + 'Flat FOURTH FLOOR': 'mid-floor', + 'Flat Homeless Unit': 'unknown', + 'Bungalow MT': 'mid-terrace', + 'Bungalow Homeless Unit': 'unknown', + 'House MT': 'mid-terrace', + 'Flat FIFTH FLOOR': 'mid-floor', + 'Flat NINTH FLOOR': 'mid-floor', + 'House SD FIRST FLOOR': 'semi-detached', + 'Bungalow Supported housing': 'unknown', + 'Flat THIRD FLOOR': 'mid-floor', + 'Flat SECOND FLOOR': 'mid-floor', + 'House Homeless Unit': 'unknown', + 'Flat ELEVENTH FLOOR': 'mid-floor', + 'Flat TENTH FLOOR': 'mid-floor', + 'House. MT': 'mid-terrace' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index e255ba4d..aceecd8f 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -263,5 +263,18 @@ HEATING_MAPPINGS = { 'Oil Standard Boiler Heating': 'oil boiler', 'Oil Condensing Boiler Heating': 'oil boiler', 'Electric ASHP': 'air source heat pump', - 'Modern Slimline Storage Heaters': 'electric storage heaters' + 'Modern Slimline Storage Heaters': 'electric storage heaters', + # These are boiler makes from Unitas + 'UNKNOWN': 'unknown', + 'IDEAL': 'gas combi boiler', + 'VAILLANT': 'gas combi boiler', + 'THORN': 'gas combi boiler', + 'WORCESTER': 'gas combi boiler', + 'GLOW WORM': 'gas combi boiler', + 'VOKERA': 'gas combi boiler', + 'POTTERTON': 'gas combi boiler', + 'BAXI SOLO': 'gas combi boiler', + 'BAXI BERMUDA': 'gas combi boiler', + 'BAXI': 'gas combi boiler' + } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 225d1a1f..303ba0b3 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -206,5 +206,18 @@ PROPERTY_MAPPING = { '02 FLAT': 'flat', '04 MAISONETTE': 'maisonette', '01 HOUSE MID': 'house', - '03 BUNGALOW': 'bungalow' + '03 BUNGALOW': 'bungalow', + 'Flat?': 'flat', + 'Bungalow ET': 'bungalow', + 'House. SD': 'house', + 'Bungalow SD': 'bungalow', + 'Bungalow D': 'bungalow', + 'House D': 'house', + 'House SD': 'house', + 'House ET': 'house', + 'Bungalow MT': 'bungalow', + 'House MT': 'house', + 'House. MT': 'house', + '': 'unknown' + } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py index a95f0529..03d6f9af 100644 --- a/asset_list/mappings/roof.py +++ b/asset_list/mappings/roof.py @@ -6,6 +6,7 @@ STANDARD_ROOF_CONSTRUCTIONS = { "pitched unknown access to loft", "piched unknown insulation", "pitched insulated", + "pitched less than 100mm insulation" "another dwelling above", "flat unknown insulation", "unknown insulated", @@ -23,5 +24,18 @@ ROOF_CONSTRUCTION_MAPPINGS = { '2018 onwards': 'unknown', 'Pitched (vaulted ceiling)': 'pitched insulated', np.nan: "unknown", - None: "unknown" + None: "unknown", + 'Unknown': 'unknown', + '270mm': 'pitched insulated', + '300mm+': 'pitched insulated', + '100mm': 'pitched less than 100mm insulation', + '250mm': 'pitched insulated', + '300mm': 'pitched insulated', + 'No Loft space': 'pitched no access to loft', + '75mm': 'pitched less than 100mm insulation', + '150mm': 'pitched insulated', + 'No Loft Hatch': 'pitched unknown access to loft', + '200mm': 'pitched insulated', + '0-49mm': 'pitched less than 100mm insulation', + '50mm': 'pitched less than 100mm insulation', }