mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Ordnance Survey logic basically finished
This commit is contained in:
parent
db251c1857
commit
1b3a942c30
2 changed files with 74 additions and 28 deletions
|
|
@ -129,7 +129,9 @@ def get_uprn_candidates(
|
|||
|
||||
user_norm = addressMatch.normalise_address(user_address)
|
||||
|
||||
out["lexiscore"] = out[address_column].apply(lambda x: addressMatch.levenshtein(user_norm, x))
|
||||
out["lexiscore"] = out[address_column].apply(
|
||||
lambda x: addressMatch.levenshtein(user_norm, x)
|
||||
)
|
||||
|
||||
# Normalise UPRN to string
|
||||
out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
|
||||
|
|
@ -346,7 +348,7 @@ def handler(event, context, local=False):
|
|||
{
|
||||
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
|
||||
"sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
@ -507,9 +509,9 @@ def handler(event, context, local=False):
|
|||
results_data.append(
|
||||
{
|
||||
**row, # Include all original data
|
||||
"uprn": uprn,
|
||||
"domna_found_address": found_address,
|
||||
"domna_lexiscore": score,
|
||||
"address2uprn_uprn": uprn,
|
||||
"address2uprn_address": found_address,
|
||||
"address2uprn_lexiscore": score,
|
||||
}
|
||||
)
|
||||
else:
|
||||
|
|
@ -519,9 +521,9 @@ def handler(event, context, local=False):
|
|||
results_data.append(
|
||||
{
|
||||
**row, # Include all original data
|
||||
"uprn": None,
|
||||
"domna_found_address": None,
|
||||
"domna_lexiscore": None,
|
||||
"address2uprn_uprn": None,
|
||||
"address2uprn_address": None,
|
||||
"address2uprn_lexiscore": None,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -533,9 +535,9 @@ def handler(event, context, local=False):
|
|||
results_data.append(
|
||||
{
|
||||
**row,
|
||||
"uprn": None,
|
||||
"domna_found_address": None,
|
||||
"domna_lexiscore": None,
|
||||
"address2uprn_uprn": None,
|
||||
"address2uprn_address": None,
|
||||
"address2uprn_lexiscore": None,
|
||||
"error": str(e),
|
||||
}
|
||||
)
|
||||
|
|
|
|||
|
|
@ -44,10 +44,7 @@ def check_if_post_code_exists_in_db_cache(postcode):
|
|||
|
||||
if response.get("status") != 200 or "data" not in response:
|
||||
logger.error(f"OS Places API failed for {postcode}: {response}")
|
||||
raise RuntimeError(
|
||||
"A postcode that doesn't exists in ordant survey and check if its real in postcode validator!!! Postcode: {postcode}"
|
||||
)
|
||||
return None
|
||||
return pd.DataFrame()
|
||||
|
||||
# Save to cache
|
||||
new_record = PostcodeSearchModel(
|
||||
|
|
@ -77,7 +74,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
|
|||
body = {
|
||||
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
|
||||
"sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/09cc7368-0850-4145-8b04-ebd84b3263c4/2026-02-18T14:00:13.228611_d2f675c3.csv",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv",
|
||||
}
|
||||
|
||||
s3_uri: str = body.get("s3_uri", "")
|
||||
|
|
@ -91,25 +88,72 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
|
|||
# Assumption: this step is designed to run after address2uprn has already been run
|
||||
csv_data = read_csv_from_s3_dict(bucket, key)
|
||||
df = pd.DataFrame(csv_data)
|
||||
df["domna_lexiscore"] = pd.to_numeric(df["domna_lexiscore"], errors="coerce")
|
||||
df["address2uprn_lexiscore"] = pd.to_numeric(
|
||||
df["address2uprn_lexiscore"], errors="coerce"
|
||||
)
|
||||
needs_processing = df[
|
||||
df["domna_lexiscore"].isna() | (df["domna_lexiscore"] < lexiscore_threshold)
|
||||
df["address2uprn_lexiscore"].isna()
|
||||
| (df["address2uprn_lexiscore"] < lexiscore_threshold)
|
||||
]
|
||||
|
||||
grouped = needs_processing.groupby("postcode_clean")
|
||||
|
||||
# Initialise new columns
|
||||
df["ordnance_survey_address"] = None
|
||||
df["ordnance_survey_uprn"] = None
|
||||
df["ordnance_survey_lexiscore"] = None
|
||||
|
||||
# Process each postcode group at a time
|
||||
for postcode, group in grouped:
|
||||
print(f"Processing postcode: {postcode} ({len(group)} rows)")
|
||||
valid_group = addressMatch.is_valid_postcode(postcode)
|
||||
if valid_group:
|
||||
postcode_cache = None
|
||||
if postcode_cache is None:
|
||||
postcode_cache = get_ordance_survey_record(postcode)
|
||||
for index, row in group.iterrows():
|
||||
print("do something")
|
||||
break
|
||||
if not valid_group:
|
||||
logger.warning(f"Postcode {postcode} is invalid, skipping")
|
||||
for idx in group.index:
|
||||
df.at[idx, "ordnance_survey_address"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
df.at[idx, "ordnance_survey_uprn"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
df.at[idx, "ordnance_survey_lexiscore"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
continue
|
||||
|
||||
# Add business logic to do handling
|
||||
# TODO: Copy over and implement the Ordnance Survey logic
|
||||
# TODO: Save new results to s3 ( ask Khalim if we want to save to db)
|
||||
postcode_cache = check_if_post_code_exists_in_db_cache(postcode)
|
||||
if postcode_cache.empty:
|
||||
logger.warning(f"No OS Places data for {postcode}")
|
||||
for idx in group.index:
|
||||
df.at[idx, "ordnance_survey_address"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
df.at[idx, "ordnance_survey_uprn"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
df.at[idx, "ordnance_survey_lexiscore"] = (
|
||||
"postcode not found in ordnance survey"
|
||||
)
|
||||
continue
|
||||
|
||||
for idx, row in group.iterrows():
|
||||
user_address = str(row.get("user_input", "")).strip()
|
||||
if not user_address:
|
||||
continue
|
||||
|
||||
# Score against OS Places addresses
|
||||
scores = postcode_cache["ADDRESS"].apply(
|
||||
lambda addr: addressMatch.score(user_address, addr)
|
||||
)
|
||||
best_idx = scores.idxmax()
|
||||
best_score = scores[best_idx]
|
||||
|
||||
df.at[idx, "ordnance_survey_address"] = postcode_cache.at[
|
||||
best_idx, "ADDRESS"
|
||||
]
|
||||
df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"]
|
||||
df.at[idx, "ordnance_survey_lexiscore"] = best_score
|
||||
|
||||
# TODO: Save new results to s3 (ask Khalim if we want to save to db)
|
||||
df.to_csv("ordnance_survey_results.csv", index=False)
|
||||
print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue