mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
save working copy of postcode_splitter
This commit is contained in:
parent
e1188ebc18
commit
0254c945e8
3 changed files with 117 additions and 14 deletions
|
|
@ -42,6 +42,9 @@ def levenshtein(a: str, b: str) -> float:
|
|||
# --- hard signal: numbers ---
|
||||
nums_a = extract_numbers(a_norm)
|
||||
nums_b = extract_numbers(b_norm)
|
||||
|
||||
if nums_a and not nums_b:
|
||||
return 0.0
|
||||
|
||||
# No shared numbers at all → impossible match
|
||||
if nums_a and nums_b and nums_a.isdisjoint(nums_b):
|
||||
|
|
@ -304,6 +307,78 @@ def get_uprn(user_inputed_address: str, postcode: str):
|
|||
# Safe to return the agreed UPRN
|
||||
return top_rank_df.iloc[0]["uprn"]
|
||||
|
||||
def resolve_uprns_for_postcode_group(
    group_df: pd.DataFrame,
    epc_df: pd.DataFrame,
    address_col: str = "Address 1",
) -> pd.DataFrame:
    """
    Resolve a UPRN for every row of a single-postcode group.

    Given:
    - group_df: rows sharing the same postcode
    - epc_df: EPC search results for that postcode
    - address_col: column of group_df holding the address text to match

    Returns:
    group_df (index reset) with five columns appended side by side:
    found_uprn, best_match_uprn, best_match_address,
    best_match_lexiscore and status.

    status is one of:
    - "no_epc_candidates": candidate scoring returned an empty frame
    - "zero_score": the first candidate's lexiscore is <= 0
    - "ambiguous": the rank-1 candidates do not agree on one UPRN
    - "matched": the rank-1 candidates agree; found_uprn is set (as str)
    """

    def _diagnostic(found, uprn, address, score, status):
        # One place for the key set, so every outcome yields the same
        # columns in the same order.
        return {
            "found_uprn": found,
            "best_match_uprn": uprn,
            "best_match_address": address,
            "best_match_lexiscore": score,
            "status": status,
        }

    def _resolve_one(raw_address):
        candidates = get_uprn_candidates(
            epc_df,
            user_address=str(raw_address).strip(),
        )
        if candidates.empty:
            return _diagnostic(None, None, None, None, "no_epc_candidates")

        # Row 0 is read as the best candidate — presumably the scorer
        # returns rows best-first; confirm against get_uprn_candidates.
        top_score = candidates.iloc[0]["lexiscore"]
        if top_score <= 0:
            return _diagnostic(None, None, None, top_score, "zero_score")

        leaders = candidates[candidates["lexirank"] == 1]
        leader = leaders.iloc[0]

        if df_has_single_uprn(leaders, leader["uprn"]):
            uprn_text = str(leader["uprn"])
            return _diagnostic(
                uprn_text, uprn_text, leader["address"], top_score, "matched"
            )

        # Rank-1 rows disagree: report the first one for inspection but do
        # not commit to it as the found UPRN.
        return _diagnostic(
            None, leader["uprn"], leader["address"], top_score, "ambiguous"
        )

    diagnostics = [
        _resolve_one(row[address_col]) for _, row in group_df.iterrows()
    ]

    # Reset the group's index so it lines up positionally with the freshly
    # built diagnostics frame before joining column-wise.
    return pd.concat(
        [group_df.reset_index(drop=True), pd.DataFrame(diagnostics)],
        axis=1,
    )
|
||||
|
||||
|
||||
|
||||
def test(a, b):
    """
    Check that two values are equal, failing with a descriptive message.

    Given:
    - a: the actual value
    - b: the expected value

    Returns:
    None when a == b.

    Raises:
    AssertionError naming both values and their types when they differ.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # stripped under `python -O`, which would make every check pass silently.
    if not (a == b):
        raise AssertionError(f"error: {a}: {type(a)} != {b}: {type(b)}")


# The helper's name matches pytest's default `test*` collection pattern; opt
# out so pytest does not try to run it as a test with fixtures `a` and `b`.
test.__test__ = False
|
||||
|
|
@ -330,6 +405,10 @@ def run_all_test():
|
|||
test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198" )
|
||||
test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False)
|
||||
test(get_uprn("1 Semley Gate", "e9 5nh"), "10008238188") # this one return "flat 1, in 1 semley gate"
|
||||
test(get_uprn("48 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate"
|
||||
test(get_uprn("42 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate"
|
||||
test(get_uprn("46 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate"
|
||||
get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1,6 +1,7 @@
|
|||
import pandas as pd
|
||||
import requests
|
||||
|
||||
from backend.address2UPRN.main import resolve_uprns_for_postcode_group, get_epc_data_with_postcode
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
|
||||
|
|
@ -41,7 +42,8 @@ def is_valid_postcode(postcode_clean: str) -> bool:
|
|||
|
||||
|
||||
def main():
|
||||
df = pd.read_excel("hackney.xlsx")
|
||||
df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability")
|
||||
df = df.head(500)
|
||||
|
||||
# Sanitise postcodes
|
||||
df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)
|
||||
|
|
@ -55,27 +57,49 @@ def main():
|
|||
.unique()
|
||||
)
|
||||
|
||||
# Validate each postcode once
|
||||
# Validate each postcode once, TODO: add a progress bar
|
||||
postcode_validity = {
|
||||
pc: is_valid_postcode(pc)
|
||||
for pc in unique_postcodes
|
||||
for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
|
||||
}
|
||||
|
||||
# Map validity back onto dataframe
|
||||
df["postcode_valid"] = df["postcode_clean"].map(postcode_validity)
|
||||
|
||||
# Group only valid postcodes
|
||||
grouped = (
|
||||
df[df["postcode_valid"]]
|
||||
.groupby("postcode_clean")
|
||||
)
|
||||
|
||||
# Example: count addresses per postcode
|
||||
postcode_counts = grouped.size().sort_values(ascending=False)
|
||||
results = []
|
||||
|
||||
for pc in sorted(unique_postcodes):
|
||||
pc_df = df[df["postcode_clean"] == pc]
|
||||
pd_df
|
||||
for postcode, group_df in tqdm(
|
||||
df[df["postcode_valid"]].groupby("postcode_clean"),
|
||||
desc="Resolving UPRNs by postcode",
|
||||
):
|
||||
try:
|
||||
epc_df = get_epc_data_with_postcode(postcode)
|
||||
|
||||
if epc_df.empty:
|
||||
tmp = group_df.copy()
|
||||
tmp["found_uprn"] = None
|
||||
tmp["status"] = "no_epc_results"
|
||||
results.append(tmp)
|
||||
continue
|
||||
|
||||
resolved = resolve_uprns_for_postcode_group(
|
||||
group_df=group_df,
|
||||
epc_df=epc_df,
|
||||
)
|
||||
|
||||
results.append(resolved)
|
||||
|
||||
except Exception as e:
|
||||
tmp = group_df.copy()
|
||||
tmp["found_uprn"] = None
|
||||
tmp["status"] = "exception"
|
||||
tmp["error"] = str(e)
|
||||
results.append(tmp)
|
||||
|
||||
final_df = pd.concat(results, ignore_index=True)
|
||||
a = final_df[["best_match_lexiscore","Address 1", "best_match_address", "Postcode", "UPRN", "best_match_uprn"]] # add levi score to viewing
|
||||
b = final_df[final_df["best_match_lexiscore"]>0] # add levi score to viewing
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue