created block splitting code for calico asset list

2026-07-27 23:35:01 +00:00 · 2025-06-19 08:15:01 +01:00 · 2025-06-19 08:15:01 +01:00 · 383a4852e2
commit 383a4852e2
parent c22179f1a5
1 changed files with 120 additions and 77 deletions
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@ -1025,6 +1025,15 @@ class AssetList:
                self.standardised_asset_list[self.STANDARD_SAP]
            )

+        has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()
+
+        # Perform block splitting, ahead of fetching the EPC data
+        # If we have blocks of flats without a landlord block reference, we create one
+        self.fill_landlord_block_reference(has_blocks_of_flats)
+
+        # If we have blocks of flats, we split these out into individual units.
+        self.split_blocks()
+
    def merge_data(self, df: pd.DataFrame):
        """
        Used to insert data into the standardised asset list, based on the domna property id
@ -1270,6 +1279,12 @@ class AssetList:
                )
            )

+            self.standardised_asset_list["SAP Category"] = np.where(
+                pd.isnull(self.standardised_asset_list[self.STANDARD_SAP]),
+                "SAP Unknown",
+                self.standardised_asset_list["SAP Category"]
+            )
+
        else:
            # We add a SAP category for all work type identification
            # We break into 4 categories (54 or less, 55-68, 69-74, 75 or more)
@ -1290,6 +1305,11 @@ class AssetList:
                    ),
                )
            )
+            self.standardised_asset_list["SAP Category"] = np.where(
+                pd.isnull(self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]]),
+                "SAP Unknown",
+                self.standardised_asset_list["SAP Category"]
+            )

        # Before we being, we identify if a property has solar already as we use this
        # for identifying cavity jobs
@ -2040,6 +2060,100 @@ class AssetList:
            self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
        )

+    def split_blocks(self):
+        """
+        Where we have a single row that is a block of flats, we split this into multiple rows,
+        one for each unit. The data that we have will be copied across rows
+        :return:
+        """
+
+        blocks = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
+            ].copy()
+
+        if blocks.empty:
+            return
+
+        RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-–]\s*(\d+[A-Za-z]?)\b')
+        NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
+
+        expanded_rows = []
+        for _, row in blocks.iterrows():
+            addr = str(row[self.STANDARD_ADDRESS_1])
+
+            # 1 ─ Range  (e.g. 1-7)
+            m_range = RANGE_RE.search(addr)
+            if m_range:
+                start, end = m_range.groups()
+                start, end = int(re.match(r'\d+', start)[0]), int(re.match(r'\d+', end)[0])
+                if start > end or (end - start) > 100:
+                    raise ValueError(f"Suspicious range '{addr}'")
+                for n in range(start, end + 1):
+                    new = row.copy()
+                    new_addr = RANGE_RE.sub(str(n), addr, count=1)
+                    original_full_address = new[self.STANDARD_FULL_ADDRESS]
+                    new_full_address = original_full_address.replace(addr, new_addr)
+                    new[self.STANDARD_ADDRESS_1] = new_addr
+                    new[self.STANDARD_FULL_ADDRESS] = new_full_address
+                    new[self.STANDARD_PROPERTY_TYPE] = "flat"
+                    # Keep a record of the previous address 1
+                    new["block_address1"] = addr
+                    new["block_full_address"] = original_full_address
+                    new["is_expended_block"] = True
+                    # Derive a per-unit property id from the block's id and the unit address
+
+                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
+                    expanded_rows.append(new)
+                continue
+
+            # 2 ─ Explicit list  (e.g. 1, 2, 5 Block)
+            nums = NUM_RE.findall(addr)
+            if len(nums) > 1 and ',' in addr:
+                for n in nums:
+                    new = row.copy()
+                    new_addr = re.sub(NUM_RE, n, addr, count=1)  # replace the first number only
+                    new[self.STANDARD_ADDRESS_1] = new_addr
+                    new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
+                    expanded_rows.append(new)
+                continue
+
+            # 3 ─ Single number or no number, treat as individual dwelling
+            if (len(nums) == 1) or not nums:
+                expanded_rows.append(row)
+                continue
+
+            # Anything else with digits is unrecognised
+            raise NotImplementedError(f"Unhandled block format: '{addr}'")
+
+        expanded_blocks = pd.DataFrame(expanded_rows)
+
+        # We drop the blocks from the standardised asset list and append on the expanded blocks
+        self.standardised_asset_list = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
+            ]
+
+        self.standardised_asset_list = pd.concat(
+            [self.standardised_asset_list, expanded_blocks],
+            ignore_index=True
+        )
+
+        # As a final clean up, for any blocks that are size 1, we don't include a project code
+        sizes = (
+            expanded_blocks
+            .groupby(self.STANDARD_BLOCK_REFERENCE)[self.DOMNA_PROPERTY_ID]
+            .nunique()
+            .reset_index()
+        )
+        size_1 = sizes[sizes[self.DOMNA_PROPERTY_ID] <= 1]
+        # Clear the block reference for the size 1 blocks in the standardised asset list
+        self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE] = np.where(
+            self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE].isin(
+                size_1[self.STANDARD_BLOCK_REFERENCE].values
+            ),
+            None,
+            self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
+        )
+
    def label_property_status(self):
        """
        This function is designed to be run after identify_worktypes() has been run, and will create a "property_status"
@ -2081,85 +2195,14 @@ class AssetList:
        # These blocks may be refecence via the landlord_block_reference field, or by property types being
        # blocks of flats
        has_landlord_block_reference = self.landlord_block_reference is not None
-        has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()

-        if has_landlord_block_reference or has_blocks_of_flats:
+        if has_landlord_block_reference:

-            # If we blocks of flats, without a landlord block reference, we create this
-            self.fill_landlord_block_reference(has_blocks_of_flats)
-
-            self.split_blocks(has_blocks_of_flats)
-
-            def split_blocks(self, has_blocks_of_flats):
-                """
-                Where we have a single row that is a block of flats, we split this into multiple rows,
-                one for each unit. The data that we have will be copied across rows
-                :param self:
-                :param has_blocks_of_flats:
-                :return:
-                """
-
-                blocks = self.standardised_asset_list[
-                    self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
-                    ].copy()
-
-                RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-–]\s*(\d+[A-Za-z]?)\b')
-                NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
-
-                expanded_rows = []
-                for _, row in blocks.iterrows():
-                    addr = str(row[self.STANDARD_ADDRESS_1])
-
-                    # 1 ─ Range  (e.g. 1-7)
-                    m_range = RANGE_RE.search(addr)
-                    if m_range:
-                        start, end = m_range.groups()
-                        start, end = int(re.match(r'\d+', start)[0]), int(re.match(r'\d+', end)[0])
-                        if start > end or (end - start) > 100:
-                            raise ValueError(f"Suspicious range '{addr}'")
-                        for n in range(start, end + 1):
-                            new = row.copy()
-                            new_addr = RANGE_RE.sub(str(n), addr, count=1)
-                            new[self.STANDARD_ADDRESS_1] = new_addr
-                            new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
-                            expanded_rows.append(new)
-                        continue
-
-                    # 2 ─ Explicit list  (e.g. 1, 2, 5 Block)
-                    nums = NUM_RE.findall(addr)
-                    if len(nums) > 1 and ',' in addr:
-                        for n in nums:
-                            new = row.copy()
-                            new_addr = re.sub(NUM_RE, n, addr, count=1)  # replace the first number only
-                            new[self.STANDARD_ADDRESS_1] = new_addr
-                            new[self.DOMNA_PROPERTY_ID] = f"{row[self.DOMNA_PROPERTY_ID]}-{new_addr}"
-                            expanded_rows.append(new)
-                        continue
-
-                    # 3 ─ Single number → treat as individual dwelling
-                    if len(nums) == 1:
-                        expanded_rows.append(row)
-                        continue
-
-                    # 4 ─ No numbers → keep as-is
-                    if not nums:
-                        expanded_rows.append(row)
-                        continue
-
-                    # Anything else with digits is unrecognised
-                    raise NotImplementedError(f"Unhandled block format: '{addr}'")
-
-                expanded_blocks = pd.DataFrame(expanded_rows)
-
-                # We drop the blocks from the standardised asset list and append on the expanded blocks
-                self.standardised_asset_list = self.standardised_asset_list[
-                    self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
-                    ]
-
-                self.standardised_asset_list = pd.concat(
-                    [self.standardised_asset_list, expanded_blocks],
-                    ignore_index=True
-                )
+            # # If we have blocks of flats without a landlord block reference, we create one
+            # self.fill_landlord_block_reference(has_blocks_of_flats)
+            #
+            # # If we have blocks of flats, we split these out into individual units
+            # self.split_blocks()

            # For blocks that have a 50% allocation, we create project codes
            self.block_analysis()