handle HA56 dupes

This commit is contained in:
Khalim Conn-Kowlessar 2024-03-10 14:13:20 +00:00
parent 28434f43c8
commit db7b6de87b

View file

@ -189,6 +189,7 @@ class DataLoader:
"HA25": 154,
"HA41": 26,
"HA50": 5,
"HA56": 320,
"HA63": 0,
"HA117": 4
}
@ -693,6 +694,8 @@ class DataLoader:
asset_list["ECO Eligibility"]
)
return asset_list
@staticmethod
def correct_ha14_asset_list(asset_list):
@ -2040,6 +2043,14 @@ class DataLoader:
"Ls63nl", "LS6 3NL"
)
# Handle a duplicate: drop the row for 5 Mount Pleasant, CW1 3JF that is marked "CANCELLED 20.5.2022"
eco3_list = eco3_list[
~((eco3_list["Street / Block Name"] == "Mount Pleasant") &
(eco3_list["Post Code"] == "CW1 3JF") &
(eco3_list["NO "] == 5) &
(eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022"))
]
return eco3_list
def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
@ -2128,15 +2139,16 @@ class DataLoader:
# HA25 contains 119 missed entries. These cover only 24 unique postcodes, and the majority belong to just 2 of them,
# where many surveys were conducted on house numbers that are not in the asset list
# 154 missed, 2827 matched for HA 25
# For HA56, the number of missed entries is high at 320; however, a large portion of these arise because the block
# is listed in the asset list while the individual units are listed in the survey list
if len(missed) != self.UNMATCHED_ECO3[ha_name]:
raise ValueError(
f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
)
missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)]
matching_lookup = pd.DataFrame(matching_lookup)
# Check dupes as this will cause problems later on
if matching_lookup["asset_list_row_id"].duplicated().any():
if matching_lookup["asset_list_row_id"].duplicated().sum():
raise ValueError("Duplicated asset list row ids")
# Merge onto eco3 list