This commit is contained in:
Jun-te Kim 2026-03-09 13:23:20 +00:00
parent 1b9c26a2b6
commit 4d013f3295
3 changed files with 18 additions and 20 deletions

View file

@@ -14,7 +14,7 @@ from utils.s3 import (
)
from datetime import datetime
from backend.utils.addressMatch import addressMatch
from backend.utils.addressMatch import AddressMatch
logger = setup_logger()
@@ -35,7 +35,7 @@ def score_addresses(
if column not in df.columns:
raise ValueError(f"Missing column: {column}")
return df[column].apply(lambda x: addressMatch.score(user_address, x))
return df[column].apply(lambda x: AddressMatch.score(user_address, x))
def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
@@ -127,10 +127,10 @@ def get_uprn_candidates(
out = df.copy()
user_norm = addressMatch.normalise_address(user_address)
user_norm = AddressMatch.normalise_address(user_address)
out["lexiscore"] = out[address_column].apply(
lambda x: addressMatch.levenshtein(user_norm, x)
lambda x: AddressMatch.levenshtein(user_norm, x)
)
# Normalise UPRN to string
@@ -455,7 +455,7 @@ def handler(event, context, local=False):
)
# Validate postcode before processing
if not addressMatch.is_valid_postcode(postcode):
if not AddressMatch.is_valid_postcode(postcode):
logger.warning(f"Postcode {postcode} is invalid, skipping")
continue

View file

@@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Optional
import json
from utils.logger import setup_logger
import logging
@@ -8,7 +8,7 @@ from utils.s3 import (
read_csv_from_s3 as read_csv_from_s3_dict,
parse_s3_uri,
)
from backend.utils.addressMatch import addressMatch
from backend.utils.addressMatch import AddressMatch
from backend.app.db.connection import get_db_session
from backend.app.db.models.postcode_search import PostcodeSearchModel
from backend.utils.ordnance_survey import (
@@ -124,7 +124,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
s3_uri: str = body.get("s3_uri", "")
lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5)
lexiscore_column: str = body.get("lexiscore_column", None)
lexiscore_column: Optional[str] = body.get("lexiscore_column", None)
task_id: str = body.get("task_id", "")
sub_task_id: str = body.get("sub_task_id", "")
@@ -158,7 +158,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
# Process each postcode group at a time
for postcode, group in grouped:
print(f"Processing postcode: {postcode} ({len(group)} rows)")
valid_group = addressMatch.is_valid_postcode(postcode)
valid_group = AddressMatch.is_valid_postcode(postcode)
if not valid_group:
logger.warning(f"Postcode {postcode} is invalid, skipping")
for idx in group.index:
@@ -203,7 +203,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
# Score against OS Places addresses
scores = postcode_cache["ADDRESS"].apply(
lambda addr: addressMatch.score(ordnancy_survey_user_input, addr)
lambda addr: AddressMatch.score(ordnancy_survey_user_input, addr)
)
best_idx = scores.idxmax()
best_score = scores[best_idx]
@@ -215,12 +215,10 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
df.at[idx, "ordnance_survey_lexiscore"] = best_score
# Save results locally
df.to_csv("ordnance_survey_results.csv", index=False)
print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")
if local:
df.to_csv("ordnance_survey_results.csv", index=False)
print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")
# Save results to S3
if task_id and sub_task_id:
try:
save_results_to_s3(df, task_id, sub_task_id)
except Exception as s3_error:
logger.error(f"Failed to save results to S3: {s3_error}")
save_results_to_s3(df, task_id, sub_task_id)

View file

@@ -4,13 +4,13 @@ from difflib import SequenceMatcher
import requests
class addressMatch:
class AddressMatch:
def __init__(self):
return None
@staticmethod
def score(a: str, b: str) -> float:
score: float = addressMatch.levenshtein(a, b)
score: float = AddressMatch.levenshtein(a, b)
return score
@@ -143,8 +143,8 @@ class addressMatch:
return None
a_norm = addressMatch.normalise_address(a)
b_norm = addressMatch.normalise_address(b)
a_norm = AddressMatch.normalise_address(a)
b_norm = AddressMatch.normalise_address(b)
# --- hard signal: numbers ---
nums_a = extract_numbers(a_norm)