created block splitting code for calico asset list

This commit is contained in:
Khalim Conn-Kowlessar 2025-06-19 08:15:01 +01:00
parent c22179f1a5
commit 383a4852e2

View file

@ -1025,6 +1025,15 @@ class AssetList:
self.standardised_asset_list[self.STANDARD_SAP]
)
has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()
# Perform block splitting, ahead of fetching the EPC data
# If we have blocks of flats without a landlord block reference, we create one
self.fill_landlord_block_reference(has_blocks_of_flats)
# If we have blocks of flats, we split these out into individual units.
self.split_blocks()
def merge_data(self, df: pd.DataFrame):
"""
Used to insert data into the standardised asset list, based on the domna property id
@ -1270,6 +1279,12 @@ class AssetList:
)
)
self.standardised_asset_list["SAP Category"] = np.where(
pd.isnull(self.standardised_asset_list[self.STANDARD_SAP]),
"SAP Unknown",
self.standardised_asset_list["SAP Category"]
)
else:
# We add a SAP category for all work type identification
# We break into 4 categories (54 or less, 55-68, 69-74, 75 or more)
@ -1290,6 +1305,11 @@ class AssetList:
),
)
)
self.standardised_asset_list["SAP Category"] = np.where(
pd.isnull(self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]]),
"SAP Unknown",
self.standardised_asset_list["SAP Category"]
)
# Before we begin, we identify if a property has solar already as we use this
# for identifying cavity jobs
@ -2040,6 +2060,100 @@ class AssetList:
self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
)
def split_blocks(self):
    """
    Where we have a single row that is a block of flats, we split this into multiple rows,
    one for each unit. The data that we have will be copied across rows.

    Rows whose property type is "block of flats" are inspected via their address line 1:

    1. A numeric range ("1-7 Some House") becomes one row per number in the range.
    2. A comma separated list ("1, 2, 5 Some House") becomes one row per listed number.
    3. An address with a single number, or no number at all, is kept unchanged.

    Every unit row created in cases 1 and 2 is a copy of the block row with: the unit's own
    address line 1, the full address rewritten to match (when it is a string), the property
    type set to "flat", the block's original addresses kept in "block_address1" and
    "block_full_address", "is_expended_block" set to True, and a property id built from the
    block's id plus the unit address.

    Finally, any block reference that ends up with at most one distinct property id among
    the expanded rows is cleared (set to None) across the standardised asset list.

    :return: None. self.standardised_asset_list is replaced with the expanded list.
    :raises ValueError: if a range runs backwards or spans more than 100 numbers.
    :raises NotImplementedError: if the address holds several numbers but is neither a
        range nor contains a comma.
    """
    blocks = self.standardised_asset_list[
        self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
    ].copy()

    if blocks.empty:
        return

    RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-]\s*(\d+[A-Za-z]?)\b')
    NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
    # Two or more unit numbers written next to each other, separated by commas (1, 2, 5)
    LIST_RE = re.compile(r'\b\d+[A-Za-z]?(?:\s*,\s*\d+[A-Za-z]?)+\b')
    LEADING_INT_RE = re.compile(r'\d+')

    def _make_unit(block_row, block_addr, unit_addr):
        # Build the row for a single unit from its parent block row
        unit = block_row.copy()
        original_full_address = unit[self.STANDARD_FULL_ADDRESS]
        # A missing full address (None / NaN) has no .replace, so we only rewrite strings
        if isinstance(original_full_address, str):
            unit[self.STANDARD_FULL_ADDRESS] = original_full_address.replace(block_addr, unit_addr)
        unit[self.STANDARD_ADDRESS_1] = unit_addr
        unit[self.STANDARD_PROPERTY_TYPE] = "flat"
        # Keep a record of the block's original addresses
        unit["block_address1"] = block_addr
        unit["block_full_address"] = original_full_address
        # NOTE(review): column name spelling is kept as-is, other code may read it - confirm
        unit["is_expended_block"] = True
        unit[self.DOMNA_PROPERTY_ID] = f"{block_row[self.DOMNA_PROPERTY_ID]}-{unit_addr}"
        return unit

    expanded_rows = []

    for _, row in blocks.iterrows():
        addr = str(row[self.STANDARD_ADDRESS_1])

        # 1 - Range (e.g. 1-7)
        m_range = RANGE_RE.search(addr)
        if m_range:
            # Only the leading digits are used as bounds, any letter suffix (12A) is ignored
            start, end = (int(LEADING_INT_RE.match(bound)[0]) for bound in m_range.groups())
            if start > end or (end - start) > 100:
                raise ValueError(f"Suspicious range '{addr}'")

            for n in range(start, end + 1):
                expanded_rows.append(_make_unit(row, addr, RANGE_RE.sub(str(n), addr, count=1)))
            continue

        # 2 - Explicit list (e.g. 1, 2, 5 Block)
        nums = NUM_RE.findall(addr)
        if len(nums) > 1 and ',' in addr:
            m_list = LIST_RE.search(addr)
            if m_list:
                # Swap the whole list for each unit number, so "1, 2, 5 Block" gives
                # "1 Block", "2 Block" and "5 Block" rather than "2, 2, 5 Block"
                prefix, suffix = addr[:m_list.start()], addr[m_list.end():]
                unit_addrs = [f"{prefix}{n}{suffix}" for n in NUM_RE.findall(m_list.group(0))]
            else:
                # The numbers are not written as one comma separated list; we keep the
                # previous behaviour of replacing the first number only
                unit_addrs = [NUM_RE.sub(n, addr, count=1) for n in nums]

            for unit_addr in unit_addrs:
                expanded_rows.append(_make_unit(row, addr, unit_addr))
            continue

        # 3 - Single number or no number, treat as individual dwelling
        if len(nums) <= 1:
            expanded_rows.append(row)
            continue

        # Anything else with digits is unrecognised
        raise NotImplementedError(f"Unhandled block format: '{addr}'")

    expanded_blocks = pd.DataFrame(expanded_rows)

    # We drop the blocks from the standardised asset list and append on the expanded blocks
    self.standardised_asset_list = self.standardised_asset_list[
        self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
    ]
    self.standardised_asset_list = pd.concat(
        [self.standardised_asset_list, expanded_blocks],
        ignore_index=True
    )

    # As a final clean up, for any blocks that are size 1, we don't include a project code
    sizes = (
        expanded_blocks
        .groupby(self.STANDARD_BLOCK_REFERENCE)[self.DOMNA_PROPERTY_ID]
        .nunique()
        .reset_index()
    )
    size_1 = sizes[sizes[self.DOMNA_PROPERTY_ID] <= 1]

    # Remove the size 1 block references from the standardised asset list
    self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE] = np.where(
        self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE].isin(
            size_1[self.STANDARD_BLOCK_REFERENCE].values
        ),
        None,
        self.standardised_asset_list[self.STANDARD_BLOCK_REFERENCE]
    )
def label_property_status(self):
"""
This function is designed to be run after identify_worktypes() has been run, and will create a "property_status"
@ -2081,85 +2195,14 @@ class AssetList:
# These blocks may be referenced via the landlord_block_reference field, or by property types being
# blocks of flats
has_landlord_block_reference = self.landlord_block_reference is not None
has_blocks_of_flats = (self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats").sum()
if has_landlord_block_reference or has_blocks_of_flats:
if has_landlord_block_reference:
# If we have blocks of flats without a landlord block reference, we create one
self.fill_landlord_block_reference(has_blocks_of_flats)
self.split_blocks(has_blocks_of_flats)
def split_blocks(self, has_blocks_of_flats):
    """
    Where we have a single row that is a block of flats, we split this into multiple rows,
    one for each unit. The data that we have will be copied across rows.

    Rows whose property type is "block of flats" are inspected via their address line 1:

    1. A numeric range ("1-7 Some House") becomes one row per number in the range.
    2. A comma separated list ("1, 2, 5 Some House") becomes one row per listed number.
    3. An address with a single number, or no number at all, is kept unchanged.

    Each unit row is a copy of the block row with its own address line 1 and a property id
    built from the block's id plus the unit address.

    :param self:
    :param has_blocks_of_flats: not read by this method; kept so existing callers still work
    :return: None. self.standardised_asset_list is replaced with the expanded list.
    :raises ValueError: if a range runs backwards or spans more than 100 numbers.
    :raises NotImplementedError: if the address holds several numbers but is neither a
        range nor contains a comma.
    """
    blocks = self.standardised_asset_list[
        self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
    ].copy()

    # Nothing to split, so we leave the asset list untouched
    if blocks.empty:
        return

    RANGE_RE = re.compile(r'\b(\d+[A-Za-z]?)\s*[-]\s*(\d+[A-Za-z]?)\b')
    NUM_RE = re.compile(r'\b\d+[A-Za-z]?\b')  # captures 12, 12A, etc.
    # Two or more unit numbers written next to each other, separated by commas (1, 2, 5)
    LIST_RE = re.compile(r'\b\d+[A-Za-z]?(?:\s*,\s*\d+[A-Za-z]?)+\b')
    LEADING_INT_RE = re.compile(r'\d+')

    def _make_unit(block_row, unit_addr):
        # Build the row for a single unit from its parent block row
        unit = block_row.copy()
        unit[self.STANDARD_ADDRESS_1] = unit_addr
        unit[self.DOMNA_PROPERTY_ID] = f"{block_row[self.DOMNA_PROPERTY_ID]}-{unit_addr}"
        return unit

    expanded_rows = []

    for _, row in blocks.iterrows():
        addr = str(row[self.STANDARD_ADDRESS_1])

        # 1 - Range (e.g. 1-7)
        m_range = RANGE_RE.search(addr)
        if m_range:
            # Only the leading digits are used as bounds, any letter suffix (12A) is ignored
            start, end = (int(LEADING_INT_RE.match(bound)[0]) for bound in m_range.groups())
            if start > end or (end - start) > 100:
                raise ValueError(f"Suspicious range '{addr}'")

            for n in range(start, end + 1):
                expanded_rows.append(_make_unit(row, RANGE_RE.sub(str(n), addr, count=1)))
            continue

        # 2 - Explicit list (e.g. 1, 2, 5 Block)
        nums = NUM_RE.findall(addr)
        if len(nums) > 1 and ',' in addr:
            m_list = LIST_RE.search(addr)
            if m_list:
                # Swap the whole list for each unit number, so "1, 2, 5 Block" gives
                # "1 Block", "2 Block" and "5 Block" rather than "2, 2, 5 Block"
                prefix, suffix = addr[:m_list.start()], addr[m_list.end():]
                unit_addrs = [f"{prefix}{n}{suffix}" for n in NUM_RE.findall(m_list.group(0))]
            else:
                # The numbers are not written as one comma separated list; we keep the
                # previous behaviour of replacing the first number only
                unit_addrs = [NUM_RE.sub(n, addr, count=1) for n in nums]

            for unit_addr in unit_addrs:
                expanded_rows.append(_make_unit(row, unit_addr))
            continue

        # 3 - Single number or no numbers, keep the row as-is
        if len(nums) <= 1:
            expanded_rows.append(row)
            continue

        # Anything else with digits is unrecognised
        raise NotImplementedError(f"Unhandled block format: '{addr}'")

    expanded_blocks = pd.DataFrame(expanded_rows)

    # We drop the blocks from the standardised asset list and append on the expanded blocks
    self.standardised_asset_list = self.standardised_asset_list[
        self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
    ]
    self.standardised_asset_list = pd.concat(
        [self.standardised_asset_list, expanded_blocks],
        ignore_index=True
    )
# # If we blocks of flats, without a landlord block reference, we create this
# self.fill_landlord_block_reference(has_blocks_of_flats)
#
# # If we have blocks of flats, we split these out into individual units
# self.split_blocks()
# For blocks that have a 50% allocation, we create project codes
self.block_analysis()