diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 62016239..acca0c58 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1025,6 +1025,15 @@ class AssetList:
             self.standardised_asset_list[self.STANDARD_SAP]
         )
 
+        has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()
+
+        # Perform block splitting, ahead of fetching the EPC data
+        # If we have blocks of flats without a landlord block reference, we create one
+        self.fill_landlord_block_reference(has_blocks_of_flats)
+
+        # If we have blocks of flats, we split these out into individual units.
+        self.split_blocks()
+
     def merge_data(self, df: pd.DataFrame):
         """
         Used to insert data into the standardised asset list, based on the domna property id
@@ -1270,6 +1279,12 @@ class AssetList:
                 )
             )
 
+            self.standardised_asset_list["SAP Category"] = np.where(
+                pd.isnull(self.standardised_asset_list[self.STANDARD_SAP]),
+                "SAP Unknown",
+                self.standardised_asset_list["SAP Category"]
+            )
+
         else:
             # We add a SAP category for all work type identification
             # We break into 4 categories (54 or less, 55-68, 69-74, 75 or more)
@@ -1290,6 +1305,11 @@ class AssetList:
                     ),
                 )
             )
+            self.standardised_asset_list["SAP Category"] = np.where(
+                pd.isnull(self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]]),
+                "SAP Unknown",
+                self.standardised_asset_list["SAP Category"]
+            )
 
         # Before we being, we identify if a property has solar already as we use this
         # for identifying cavity jobs
@@ -2040,6 +2060,100 @@ class AssetList:
             self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
         )
 
+    def split_blocks(self):
+        """
+        Where we have a single row that is a block of flats, we split this into multiple rows,
+        one for each unit. The data that we have will be copied across rows
+        :return:
+        """
+
+        blocks = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
+        ].copy()
+
+        if blocks.empty:
+            return
+
+        RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-–]\s*(\d+[A-Za-z]?)\b')
+        NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
+
+        expanded_rows = []
+        for _, row in blocks.iterrows():
+            addr = str(row[self.STANDARD_ADDRESS_1])
+
+            # 1 ─ Range (e.g. 1-7)
+            m_range = RANGE_RE.search(addr)
+            if m_range:
+                start, end = m_range.groups()
+                start, end = int(re.match(r'\d+', start)[0]), int(re.match(r'\d+', end)[0])
+                if start > end or (end - start) > 100:
+                    raise ValueError(f"Suspicious range '{addr}'")
+                for n in range(start, end + 1):
+                    new = row.copy()
+                    new_addr = RANGE_RE.sub(str(n), addr, count=1)
+                    original_full_address = new[self.STANDARD_FULL_ADDRESS]
+                    new_full_address = original_full_address.replace(addr, new_addr)
+                    new[self.STANDARD_ADDRESS_1] = new_addr
+                    new[self.STANDARD_FULL_ADDRESS] = new_full_address
+                    new[self.STANDARD_PROPERTY_TYPE] = "flat"
+                    # Keep a record of the block's original address 1 and full address
+                    new["block_address1"] = addr
+                    new["block_full_address"] = original_full_address
+                    new["is_expended_block"] = True
+                    # Give the unit its own property id, derived from the block's id and the unit address
+
+                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
+                    expanded_rows.append(new)
+                continue
+
+            # 2 ─ Explicit list (e.g. 1, 2, 5 Block)
+            nums = NUM_RE.findall(addr)
+            if len(nums) > 1 and ',' in addr:
+                for n in nums:
+                    new = row.copy()
+                    new_addr = re.sub(NUM_RE, n, addr, count=1)  # replace the first number only
+                    new[self.STANDARD_ADDRESS_1] = new_addr
+                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
+                    expanded_rows.append(new)
+                continue
+
+            # 3 ─ Single number or no number, treat as individual dwelling
+            if (len(nums) == 1) or not nums:
+                expanded_rows.append(row)
+                continue
+
+            # Anything else with digits is unrecognised
+            raise NotImplementedError(f"Unhandled block format: '{addr}'")
+
+        expanded_blocks = pd.DataFrame(expanded_rows)
+
+        # We drop the blocks from the standardised asset list and append on the expanded blocks
+        self.standardised_asset_list = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
+        ]
+
+        self.standardised_asset_list = pd.concat(
+            [self.standardised_asset_list, expanded_blocks],
+            ignore_index=True
+        )
+
+        # As a final clean up, for any blocks that are size 1, we don't include a project code
+        sizes = (
+            expanded_blocks
+            .groupby(self.STANDARD_BLOCK_REFERENCE)[self.DOMNA_PROPERTY_ID]
+            .nunique()
+            .reset_index()
+        )
+        size_1 = sizes[sizes[self.DOMNA_PROPERTY_ID] <= 1]
+        # Clear the block reference of the size 1 blocks in the standardised asset list (no rows are removed)
+        self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE] = np.where(
+            self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE].isin(
+                size_1[self.STANDARD_BLOCK_REFERENCE].values
+            ),
+            None,
+            self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
+        )
+
     def label_property_status(self):
         """
         This function is designed to be run after identify_worktypes() has been run, and will create a "property_status"
@@ -2081,85 +2195,14 @@ class AssetList:
         # These blocks may be refecence via the landlord_block_reference field, or by property types being
         # blocks of flats
         has_landlord_block_reference = self.landlord_block_reference is not None
-        has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()
-        if has_landlord_block_reference or has_blocks_of_flats:
+        if has_landlord_block_reference:
 
-            # If we blocks of flats, without a landlord block reference, we create this
-            self.fill_landlord_block_reference(has_blocks_of_flats)
-
-            self.split_blocks(has_blocks_of_flats)
-
-    def split_blocks(self, has_blocks_of_flats):
-        """
-        Where we have a single row that is a block of flats, we split this into multiple rows,
-        one for each unit. The data that we have will be copied across rows
-        :param self:
-        :param has_blocks_of_flats:
-        :return:
-        """
-
-        blocks = self.standardised_asset_list[
-            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
-        ].copy()
-
-        RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-–]\s*(\d+[A-Za-z]?)\b')
-        NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
-
-        expanded_rows = []
-        for _, row in blocks.iterrows():
-            addr = str(row[self.STANDARD_ADDRESS_1])
-
-            # 1 ─ Range (e.g. 1-7)
-            m_range = RANGE_RE.search(addr)
-            if m_range:
-                start, end = m_range.groups()
-                start, end = int(re.match(r'\d+', start)[0]), int(re.match(r'\d+', end)[0])
-                if start > end or (end - start) > 100:
-                    raise ValueError(f"Suspicious range '{addr}'")
-                for n in range(start, end + 1):
-                    new = row.copy()
-                    new_addr = RANGE_RE.sub(str(n), addr, count=1)
-                    new[self.STANDARD_ADDRESS_1] = new_addr
-                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
-                    expanded_rows.append(new)
-                continue
-
-            # 2 ─ Explicit list (e.g. 1, 2, 5 Block)
-            nums = NUM_RE.findall(addr)
-            if len(nums) > 1 and ',' in addr:
-                for n in nums:
-                    new = row.copy()
-                    new_addr = re.sub(NUM_RE, n, addr, count=1)  # replace the first number only
-                    new[self.STANDARD_ADDRESS_1] = new_addr
-                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
-                    expanded_rows.append(new)
-                continue
-
-            # 3 ─ Single number → treat as individual dwelling
-            if len(nums) == 1:
-                expanded_rows.append(row)
-                continue
-
-            # 4 ─ No numbers → keep as-is
-            if not nums:
-                expanded_rows.append(row)
-                continue
-
-            # Anything else with digits is unrecognised
-            raise NotImplementedError(f"Unhandled block format: '{addr}'")
-
-        expanded_blocks = pd.DataFrame(expanded_rows)
-
-        # We drop the blocks from the standardised asset list and append on the expanded blocks
-        self.standardised_asset_list = self.standardised_asset_list[
-            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
-        ]
-
-        self.standardised_asset_list = pd.concat(
-            [self.standardised_asset_list, expanded_blocks],
-            ignore_index=True
-        )
 
+            # # If we blocks of flats, without a landlord block reference, we create this
+            # self.fill_landlord_block_reference(has_blocks_of_flats)
+            #
+            # # If we have blocks of flats, we split these out into individual units
+            # self.split_blocks()
 
             # For blocks that have a 50% allocation, we create project codes
             self.block_analysis()