mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Working on fixing missed matches in ECO3 matching
This commit is contained in:
parent
ef77db1037
commit
022244377d
1 changed files with 66 additions and 18 deletions
|
|
@ -171,6 +171,10 @@ class DataLoader:
|
|||
"HA107": 51,
|
||||
}
|
||||
|
||||
UNMATCHED_ECO3 = {
|
||||
"HA25": 94
|
||||
}
|
||||
|
||||
def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
|
||||
self.directories = directories
|
||||
self.use_cache = use_cache
|
||||
|
|
@ -1458,9 +1462,6 @@ class DataLoader:
|
|||
|
||||
def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
|
||||
|
||||
# We add on a matching postcode without spaces for this
|
||||
# asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "")
|
||||
|
||||
# May need an eco3 list correction function
|
||||
|
||||
# NEADS DRIVE, postcode with bs305dt, is not found in the asset list
|
||||
|
|
@ -1471,8 +1472,17 @@ class DataLoader:
|
|||
eco3_list = eco3_list[
|
||||
~pd.isnull(eco3_list["Post Code"])
|
||||
]
|
||||
# We have a bunch of genuine duplicates
|
||||
eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"])
|
||||
|
||||
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
|
||||
"HALWILL MEADOOW", "HALWILL MEADOW"
|
||||
)
|
||||
|
||||
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
|
||||
"Hall Road", "Hall Rd"
|
||||
)
|
||||
|
||||
missed_postcodes = []
|
||||
if ha_name == "HA25":
|
||||
missed_postcodes = {
|
||||
postcode.lower() for postcode in eco3_list["Post Code"] if
|
||||
|
|
@ -1480,10 +1490,18 @@ class DataLoader:
|
|||
}
|
||||
eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)]
|
||||
|
||||
# For the asset list, we create a matching address without any punctuation
|
||||
# TODO: We should generally just remove punctuation from addresses when matching
|
||||
asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '',
|
||||
regex=True)
|
||||
# Remove double spaces
|
||||
asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
|
||||
" ", " "
|
||||
)
|
||||
|
||||
matching_lookup = []
|
||||
missed = []
|
||||
for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
|
||||
|
||||
postcode = row["Post Code"].lower().strip()
|
||||
|
||||
# df will never be empty, since we've already done a check for common postcodes
|
||||
|
|
@ -1507,24 +1525,20 @@ class DataLoader:
|
|||
if " " in str(house_number):
|
||||
house_number = house_number.split(" ")[0].strip()
|
||||
|
||||
df = df[df["matching_address"].str.contains(str(house_number))]
|
||||
# We must do the house number filter
|
||||
df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
|
||||
|
||||
# Perform a search on streetname
|
||||
# We do this to prevent duplicate matches to properties with the same postcode and house number,
|
||||
# but different streets
|
||||
street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
|
||||
street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1)
|
||||
df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)]
|
||||
|
||||
if df.empty:
|
||||
missed.append(row["eco3_list_row_id"])
|
||||
continue
|
||||
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
|
||||
|
||||
if df.empty:
|
||||
missed.append(row["eco3_list_row_id"])
|
||||
continue
|
||||
|
||||
if df.shape[0] != 1:
|
||||
# Perform a search on streetname
|
||||
street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
|
||||
df = df[df["matching_address"].str.contains(street_name_section1)]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
print(row["Street / Block Name"])
|
||||
print(house_number)
|
||||
|
|
@ -1538,6 +1552,40 @@ class DataLoader:
|
|||
}
|
||||
)
|
||||
|
||||
# We verify that the number of missed entries matches the expected count for this HA
|
||||
# -HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted
|
||||
# on properties that had house numbers outside of the asset list
|
||||
if len(missed) != self.UNMATCHED_ECO3[ha_name]:
|
||||
raise ValueError(
|
||||
f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
|
||||
)
|
||||
|
||||
# TODO: 194 missed
|
||||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
# Check dupes as this will cause problems later on
|
||||
if matching_lookup["asset_list_row_id"].duplicated().any():
|
||||
raise ValueError("Duplicated asset list row ids")
|
||||
|
||||
missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)]
|
||||
missed_df.head(3).tail(1)["eco3_list_row_id"]
|
||||
|
||||
duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist()
|
||||
duped_df = matching_lookup[
|
||||
matching_lookup["asset_list_row_id"].isin(duped_ids)
|
||||
]
|
||||
duped_surveys = eco3_list[
|
||||
eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values)
|
||||
].copy()
|
||||
|
||||
duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id")
|
||||
|
||||
duped_surveys[
|
||||
["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"]
|
||||
].sort_values("asset_list_row_id").head()
|
||||
|
||||
asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values
|
||||
|
||||
@staticmethod
|
||||
def extract_streetname(address, house_number=None, postcode=None):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue