mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
completed creation of matching tables
This commit is contained in:
parent
5a451f2f82
commit
75183902c1
1 changed files with 48 additions and 15 deletions
|
|
@ -43,7 +43,8 @@ class DataLoader:
|
|||
# the asset list
|
||||
"HA14": 4,
|
||||
# There's just too many unmatched here - if we identify some homes that
|
||||
"HA6": 117
|
||||
"HA6": 117,
|
||||
"HA107": 52
|
||||
}
|
||||
|
||||
def __init__(self, directories, use_cache):
|
||||
|
|
@ -130,7 +131,7 @@ class DataLoader:
|
|||
:return:
|
||||
"""
|
||||
|
||||
if ha_name in ["HA6", "HA14"]:
|
||||
if ha_name in ["HA6", "HA14", "HA107"]:
|
||||
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
|
||||
house_numbers = split_addresses[0].str.split(' ', expand=True)
|
||||
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
|
||||
|
|
@ -153,8 +154,11 @@ class DataLoader:
|
|||
|
||||
@staticmethod
def get_ciga_sheetname(workbook):
    """
    Pick the name of the CIGA sheet to read from a workbook.

    The known spellings are tried in order of preference; if none of them is
    listed in ``workbook.sheetnames`` the plain "CIGA" name is returned.

    :param workbook: object exposing a ``sheetnames`` collection
    :return: the sheet name to use
    """
    available = workbook.sheetnames
    # Order matters: "CIGA Checks" wins when both spellings are present
    for candidate in ("CIGA Checks", "CIGA checks"):
        if candidate in available:
            return candidate
    return "CIGA"
|
||||
|
||||
|
|
@ -490,6 +494,22 @@ class DataLoader:
|
|||
|
||||
return survey_list
|
||||
|
||||
@staticmethod
def levenstein_match(matching_string, df):
    """
    Narrow ``df`` down to the single row whose ``matching_address`` has the
    smallest Levenshtein distance to ``matching_string``.

    Each ``matching_address`` is stripped of punctuation and spaces before the
    comparison; ``matching_string`` is compared as given. When several rows
    share the minimum distance, the first of them is kept.

    :param matching_string: the key to match against
    :param df: dataframe with a ``matching_address`` column
    :return: a one-row slice of ``df``, or ``df`` unchanged when it has no rows
    """
    # With no candidates, min() below would raise ValueError. Callers inspect
    # df.shape[0] after this call, so hand the empty frame back to them.
    if df.empty:
        return df

    # Strip out punctuation and spaces
    match_to = [
        re.sub(r'[^\w\s]', '', address).replace(" ", "")
        for address in df["matching_address"].tolist()
    ]

    # Perform matching between the key and match_to
    distances = [Levenshtein.distance(matching_string, s) for s in match_to]
    best_match_index = distances.index(min(distances))

    # We might want to consider a threshold for the distance, however we don't
    # apply one for the moment - the closest row is always accepted
    return df.iloc[best_match_index:best_match_index + 1]
|
||||
|
||||
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
|
||||
|
||||
# Correct the survey list
|
||||
|
|
@ -544,17 +564,7 @@ class DataLoader:
|
|||
# Remove any spaces from the full key
|
||||
full_key = full_key.replace(" ", "")
|
||||
|
||||
match_to = df["matching_address"].tolist()
|
||||
# Strip out punctuation and spaces
|
||||
match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
|
||||
match_to = [x.replace(" ", "") for x in match_to]
|
||||
|
||||
# Perform matching between full key and match_to
|
||||
distances = [Levenshtein.distance(full_key, s) for s in match_to]
|
||||
best_match_index = distances.index(min(distances))
|
||||
# We might want to consider a threshold for the distance, however for the momeny,
|
||||
# we don't consider this for the moment
|
||||
df = df.iloc[best_match_index:best_match_index + 1]
|
||||
df = self.levenstein_match(full_key, df)
|
||||
|
||||
if df.shape[0] != 1:
|
||||
print(row["Street / Block Name"])
|
||||
|
|
@ -623,7 +633,7 @@ class DataLoader:
|
|||
asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
|
||||
].copy()
|
||||
|
||||
df = df[df["HouseNo"] == str(house_number)]
|
||||
df = df[df["HouseNo"].astype(str) == str(house_number)]
|
||||
# For ciga, we skip
|
||||
if df.empty:
|
||||
unmatched_addresses.append(
|
||||
|
|
@ -641,7 +651,9 @@ class DataLoader:
|
|||
street_name = self.extract_streetname(
|
||||
address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
|
||||
)
|
||||
df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
|
||||
# We check if any of the rows contains the street name and if they do, filter
|
||||
if any(df["matching_address"].str.replace(",", "").str.contains(street_name)):
|
||||
df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
# The final check we do here is to check for the presence of flat in the address
|
||||
|
|
@ -650,6 +662,13 @@ class DataLoader:
|
|||
else:
|
||||
df = df[df["matching_address"].str.contains("flat") == False]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
|
||||
"Matched Postcode"].lower().strip()
|
||||
# Remove any spaces from the full key
|
||||
full_key = full_key.replace(" ", "")
|
||||
df = self.levenstein_match(full_key, df)
|
||||
|
||||
if df.shape[0] != 1:
|
||||
print(row["Street / Block Name"])
|
||||
print(house_number)
|
||||
|
|
@ -737,6 +756,19 @@ class DataLoader:
|
|||
s3_file_name="ha-analysis/batch3-inputs.pickle",
|
||||
)
|
||||
|
||||
def ha_facts_and_figures(self):
    """
    Walk every HA in ``self.data`` and gather its facts and figures.

    NOTE(review): nothing is computed yet - the asset, survey and ciga lists
    are looked up for each HA but not used, so the returned list is always
    empty.

    :return: list of facts and figures (currently empty)
    """
    facts = []
    for ha_name, data_assets in self.data.items():
        # Looked up in this order; a missing key raises KeyError
        asset_list, survey_list, ciga_list = (
            data_assets[key] for key in ("asset_list", "survey_list", "ciga_list")
        )

    return facts
|
||||
|
||||
|
||||
def get_epc_data(
|
||||
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
|
||||
|
|
@ -1511,6 +1543,7 @@ def app():
|
|||
|
||||
loader = DataLoader(directories, use_cache)
|
||||
loader.load()
|
||||
loader.ha_facts_and_figures()
|
||||
|
||||
# TODO: We probably need to make sure that we have all of the columns that we need
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue