From 99a0948e2bd3ab14197821a694cbf1d2383baff3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 16:11:02 +0000 Subject: [PATCH] getting ready to work on the colchester data --- asset_list/AssetList.py | 82 ++++++++++++++++++++++++++++++++-------- asset_list/app.py | 83 ++++++----------------------------------- 2 files changed, 78 insertions(+), 87 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 54f6cd96..2b80287c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -343,6 +343,7 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} + self.flat_data = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -649,6 +650,9 @@ class AssetList: logger.info("Applying standardisation to asset list") for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable + "_original_from_landlord"] = ( + self.standardised_asset_list[variable].copy() + ) self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): @@ -663,6 +667,12 @@ class AssetList: # Apply renames to our standard names # Perform final variable selection and renaming: + + # We add the original columns to the keep variables + self.keep_variables += [ + k + "_original_from_landlord" for k in self.variable_mappings.keys() + ] + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) @@ -912,18 +922,6 @@ class AssetList: self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) - - self.standardised_asset_list["empty_cavity"] = ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | - self.standardised_asset_list["epc_indicates_empty_cavity"] - ) - # We add a reason - self.standardised_asset_list["empty_cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - "Non-Intrusive Data", - "EPC Data" - ) - ###################################################### # Extraction ###################################################### @@ -933,7 +931,7 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) + (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"]) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) @@ -996,6 +994,12 @@ class AssetList: ) ) + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: Insulated"].isin( + ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] + ) + ) + # TODO: We don't have information about the roof from this landlord # We merge on the u-value for average thermal transmittance @@ -1146,7 +1150,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & @@ -1165,7 +1170,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & @@ -1216,6 +1222,15 @@ class AssetList: columns=["walls_u_value", "roof_u_value", "floor_u_value"] ) + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible_solid_floor"] & + ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] + # ~self.standardised_asset_list["solar_eligible_other_floor"] & + # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ) + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives @@ -1296,3 +1311,40 @@ class AssetList: "Other Floor, Insulated, Needs Loft", self.standardised_asset_list["solar_reason"] ) + + def flat_analysis(self): + + # We need to deduce the building name - we strip out the house number + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = self.standardised_asset_list.groupby( + [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE] + ) + + flat_data = [] + for _, group in grouped: + if "flat" in group[self.STANDARD_PROPERTY_TYPE].values: + num_flats = group[self.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() + + flat_data.append( + { + "Postcode": group[self.STANDARD_POSTCODE].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "Number of Flats Below C69": num_flats_below_c69, + } + ) + + flat_data = pd.DataFrame(flat_data) + + self.flat_data = flat_data diff --git a/asset_list/app.py b/asset_list/app.py index 65d4ab87..f164e94e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -4,6 +4,7 @@ import json import pandas as pd import numpy as np from tqdm import tqdm +from pprint import pprint import msgpack from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList @@ -239,23 +240,18 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # For Westward - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - DATA_FILENAME = "WESTWARD - completed list..xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" SHEET_NAME = "Sheet1" - - POSTCODE_COLUMN = "WFT EDIT Postcode" - FULLADDRESS_COLUMN = "Address" + POSTCODE_COLUMN = 'Full Address.1' + FULLADDRESS_COLUMN = "Full Address" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "house_number_extraction" - + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build date" - UPRN_COLUMN = "UPRN" - # If we have the non-intrusives data, this should be true - HAS_NON_INTRUSIVES = True - PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + PROPERTY_YEAR_BUILT = "Build Date" + UPRN_COLUMN = None + PROPERTY_TYPE_COLUMN = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -297,20 +293,6 @@ def app(): asset_list.apply_standardiation() - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = 'Full Address.1' - # FULLADDRESS_COLUMN = "Full Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "first_word" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build Date" - # UPRN_COLUMN = None - # # If we have the non-intrusives data, this should be true - # HAS_NON_INTRUSIVES = True - ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time @@ -455,48 +437,9 @@ def app(): asset_list.identify_worktypes(cleaned) - from pprint import pprint pprint(asset_list.work_type_figures) - # TODO: We should do this breakdown for flats - def flat_analysis(asset_list): - - # We need to deduce the building name - we strip out the house number - - # We want to deduce if flats have 50% of the properties below C75 - # We group by postcode and property type - grouped = asset_list.standardised_asset_list.groupby( - [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE] - ) - - flat_data = [] - for _, group in grouped: - if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values: - num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0] - num_below_c75 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum() - # Check if any flats are below C69 - num_flats_below_c69 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(69).sum() - - flat_data.append( - { - "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0], - "Property Type": "Flat", - "Number of Flats with EPC": num_flats, - "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), - "num_flats_below_c69": num_flats_below_c69, - } - ) - - flat_data = pd.DataFrame(flat_data) - - return flat_data - - flat_data = flat_analysis(asset_list) + asset_list.flat_analysis() # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" @@ -504,8 +447,4 @@ def app(): with pd.ExcelWriter(filename) as writer: asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) - flat_data.to_excel(writer, sheet_name="Flat Data", index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] + asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)