completed creation of matching tables

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-23 15:54:28 +00:00
parent 5a451f2f82
commit 75183902c1

View file

@ -43,7 +43,8 @@ class DataLoader:
# the asset list
"HA14": 4,
# There's just too many unmatched here - if we identify some homes that
"HA6": 117
"HA6": 117,
"HA107": 52
}
def __init__(self, directories, use_cache):
@ -130,7 +131,7 @@ class DataLoader:
:return:
"""
if ha_name in ["HA6", "HA14"]:
if ha_name in ["HA6", "HA14", "HA107"]:
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
@ -153,8 +154,11 @@ class DataLoader:
@staticmethod
def get_ciga_sheetname(workbook):
    """
    Work out which worksheet of ``workbook`` holds the CIGA checks.

    The two spellings already seen in the source workbooks ("CIGA Checks" and "CIGA checks")
    are looked up first, in that order. If neither is present, any sheet whose name equals
    "ciga checks" once surrounding whitespace and letter case are ignored is accepted, so a
    new casing variant does not silently fall through to the default.

    :param workbook: object exposing a ``sheetnames`` collection of sheet-name strings
    :return: the sheet name exactly as it appears in ``workbook.sheetnames``, or "CIGA" when no
        checks sheet is found
    """
    sheetnames = workbook.sheetnames

    # Known spellings first, in their original priority order, so workbooks that already
    # resolved to one of them keep resolving to the same sheet.
    for known_name in ("CIGA Checks", "CIGA checks"):
        if known_name in sheetnames:
            return known_name

    # Otherwise tolerate other casings / stray padding; return the name untouched so it can
    # still be used to index the workbook.
    for sheetname in sheetnames:
        if sheetname.strip().casefold() == "ciga checks":
            return sheetname

    return "CIGA"
@ -490,6 +494,22 @@ class DataLoader:
return survey_list
@staticmethod
def levenstein_match(matching_string, df):
    """
    Narrow ``df`` down to the single row whose address is closest to ``matching_string``.

    Every value in the "matching_address" column has its punctuation and spaces removed and is
    then compared with ``matching_string`` using the Levenshtein edit distance. The row with the
    smallest distance is kept; when several rows tie, the earliest one wins.

    :param matching_string: the key to match against. It is compared as given, so it should
        already be normalised (the callers in this file lower-case it and remove its spaces).
    :param df: candidate rows; must have a "matching_address" column of strings.
    :return: a one-row DataFrame holding the best match, or ``df`` unchanged when it has no rows.
    """
    # With no candidates there is nothing to rank, and min() on an empty list would raise
    # ValueError. Hand the empty frame back so the caller's own row-count check deals with it.
    if df.empty:
        return df

    # Strip out punctuation and spaces so formatting differences don't count as edits
    candidates = [
        re.sub(r'[^\w\s]', '', address).replace(" ", "")
        for address in df["matching_address"].tolist()
    ]

    # Perform matching between the supplied key and every cleaned candidate
    distances = [Levenshtein.distance(matching_string, candidate) for candidate in candidates]
    # list.index returns the first occurrence, so ties resolve to the earliest row
    best_match_index = distances.index(min(distances))

    # We might want a threshold on the distance; for the moment the closest row is always
    # accepted, however far away it is.
    return df.iloc[best_match_index:best_match_index + 1]
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
# Correct the survey list
@ -544,17 +564,7 @@ class DataLoader:
# Remove any spaces from the full key
full_key = full_key.replace(" ", "")
match_to = df["matching_address"].tolist()
# Strip out punctuation and spaces
match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
match_to = [x.replace(" ", "") for x in match_to]
# Perform matching between full key and match_to
distances = [Levenshtein.distance(full_key, s) for s in match_to]
best_match_index = distances.index(min(distances))
# We might want to consider a threshold for the distance; however, for the moment
# we don't apply one
df = df.iloc[best_match_index:best_match_index + 1]
df = self.levenstein_match(full_key, df)
if df.shape[0] != 1:
print(row["Street / Block Name"])
@ -623,7 +633,7 @@ class DataLoader:
asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
].copy()
df = df[df["HouseNo"] == str(house_number)]
df = df[df["HouseNo"].astype(str) == str(house_number)]
# For ciga, we skip
if df.empty:
unmatched_addresses.append(
@ -641,7 +651,9 @@ class DataLoader:
street_name = self.extract_streetname(
address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
)
df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
# We check if any of the rows contains the street name and if they do, filter
if any(df["matching_address"].str.replace(",", "").str.contains(street_name)):
df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
if df.shape[0] != 1:
# The final check we do here is to check for the presence of flat in the address
@ -650,6 +662,13 @@ class DataLoader:
else:
df = df[df["matching_address"].str.contains("flat") == False]
if df.shape[0] != 1:
full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
"Matched Postcode"].lower().strip()
# Remove any spaces from the full key
full_key = full_key.replace(" ", "")
df = self.levenstein_match(full_key, df)
if df.shape[0] != 1:
print(row["Street / Block Name"])
print(house_number)
@ -737,6 +756,19 @@ class DataLoader:
s3_file_name="ha-analysis/batch3-inputs.pickle",
)
def ha_facts_and_figures(self):
    """
    Gather facts and figures for each HA held in ``self.data``.

    Placeholder for now: each HA's asset, survey and CIGA lists are looked up, but nothing is
    recorded from them yet, so the returned list is always empty.

    :return: list of per-HA facts and figures (currently empty)
    """
    facts = []
    for ha_name in self.data:
        ha_data = self.data[ha_name]
        # Pull out the three inputs each HA is expected to carry
        asset_list, survey_list, ciga_list = (
            ha_data[key] for key in ("asset_list", "survey_list", "ciga_list")
        )

    return facts
def get_epc_data(
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
@ -1511,6 +1543,7 @@ def app():
loader = DataLoader(directories, use_cache)
loader.load()
loader.ha_facts_and_figures()
# TODO: We probably need to make sure that we have all of the columns that we need