Ordnance Survey logic basically finished

This commit is contained in:
Jun-te Kim 2026-03-04 16:58:23 +00:00
parent db251c1857
commit 1b3a942c30
2 changed files with 74 additions and 28 deletions

View file

@ -129,7 +129,9 @@ def get_uprn_candidates(
user_norm = addressMatch.normalise_address(user_address)
out["lexiscore"] = out[address_column].apply(lambda x: addressMatch.levenshtein(user_norm, x))
out["lexiscore"] = out[address_column].apply(
lambda x: addressMatch.levenshtein(user_norm, x)
)
# Normalise UPRN to string
out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
@ -346,7 +348,7 @@ def handler(event, context, local=False):
{
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
"sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv",
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
}
)
}
@ -507,9 +509,9 @@ def handler(event, context, local=False):
results_data.append(
{
**row, # Include all original data
"uprn": uprn,
"domna_found_address": found_address,
"domna_lexiscore": score,
"address2uprn_uprn": uprn,
"address2uprn_address": found_address,
"address2uprn_lexiscore": score,
}
)
else:
@ -519,9 +521,9 @@ def handler(event, context, local=False):
results_data.append(
{
**row, # Include all original data
"uprn": None,
"domna_found_address": None,
"domna_lexiscore": None,
"address2uprn_uprn": None,
"address2uprn_address": None,
"address2uprn_lexiscore": None,
}
)
@ -533,9 +535,9 @@ def handler(event, context, local=False):
results_data.append(
{
**row,
"uprn": None,
"domna_found_address": None,
"domna_lexiscore": None,
"address2uprn_uprn": None,
"address2uprn_address": None,
"address2uprn_lexiscore": None,
"error": str(e),
}
)

View file

@ -44,10 +44,7 @@ def check_if_post_code_exists_in_db_cache(postcode):
if response.get("status") != 200 or "data" not in response:
logger.error(f"OS Places API failed for {postcode}: {response}")
raise RuntimeError(
"A postcode that doesn't exists in ordant survey and check if its real in postcode validator!!! Postcode: {postcode}"
)
return None
return pd.DataFrame()
# Save to cache
new_record = PostcodeSearchModel(
@ -77,7 +74,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
body = {
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
"sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
"s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/09cc7368-0850-4145-8b04-ebd84b3263c4/2026-02-18T14:00:13.228611_d2f675c3.csv",
"s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv",
}
s3_uri: str = body.get("s3_uri", "")
@ -91,25 +88,72 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
# Assumption: this is designed on the basis that address2uprn was run first
csv_data = read_csv_from_s3_dict(bucket, key)
df = pd.DataFrame(csv_data)
df["domna_lexiscore"] = pd.to_numeric(df["domna_lexiscore"], errors="coerce")
df["address2uprn_lexiscore"] = pd.to_numeric(
df["address2uprn_lexiscore"], errors="coerce"
)
needs_processing = df[
df["domna_lexiscore"].isna() | (df["domna_lexiscore"] < lexiscore_threshold)
df["address2uprn_lexiscore"].isna()
| (df["address2uprn_lexiscore"] < lexiscore_threshold)
]
grouped = needs_processing.groupby("postcode_clean")
# Initialise new columns
df["ordnance_survey_address"] = None
df["ordnance_survey_uprn"] = None
df["ordnance_survey_lexiscore"] = None
# Process each postcode group at a time
for postcode, group in grouped:
print(f"Processing postcode: {postcode} ({len(group)} rows)")
valid_group = addressMatch.is_valid_postcode(postcode)
if valid_group:
postcode_cache = None
if postcode_cache is None:
postcode_cache = get_ordance_survey_record(postcode)
for index, row in group.iterrows():
print("do something")
break
if not valid_group:
logger.warning(f"Postcode {postcode} is invalid, skipping")
for idx in group.index:
df.at[idx, "ordnance_survey_address"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_uprn"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_lexiscore"] = (
"postcode not found in ordnance survey"
)
continue
# Add business logic to do handling
# TODO: Copy and do ordant survey logic
# TODO: Save new results to s3 ( ask Khalim if we want to save to db)
postcode_cache = check_if_post_code_exists_in_db_cache(postcode)
if postcode_cache.empty:
logger.warning(f"No OS Places data for {postcode}")
for idx in group.index:
df.at[idx, "ordnance_survey_address"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_uprn"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_lexiscore"] = (
"postcode not found in ordnance survey"
)
continue
for idx, row in group.iterrows():
user_address = str(row.get("user_input", "")).strip()
if not user_address:
continue
# Score against OS Places addresses
scores = postcode_cache["ADDRESS"].apply(
lambda addr: addressMatch.score(user_address, addr)
)
best_idx = scores.idxmax()
best_score = scores[best_idx]
df.at[idx, "ordnance_survey_address"] = postcode_cache.at[
best_idx, "ADDRESS"
]
df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"]
df.at[idx, "ordnance_survey_lexiscore"] = best_score
# TODO: Save new results to s3 (ask Khalim if we want to save to db)
df.to_csv("ordnance_survey_results.csv", index=False)
print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")