diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7d52c562..53e50617 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -129,7 +129,9 @@ def get_uprn_candidates( user_norm = addressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply(lambda x: addressMatch.levenshtein(user_norm, x)) + out["lexiscore"] = out[address_column].apply( + lambda x: addressMatch.levenshtein(user_norm, x) + ) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -346,7 +348,7 @@ def handler(event, context, local=False): { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", } ) } @@ -507,9 +509,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "uprn": uprn, - "domna_found_address": found_address, - "domna_lexiscore": score, + "address2uprn_uprn": uprn, + "address2uprn_address": found_address, + "address2uprn_lexiscore": score, } ) else: @@ -519,9 +521,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, } ) @@ -533,9 +535,9 @@ def handler(event, context, local=False): results_data.append( { **row, - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, "error": str(e), } ) diff --git a/backend/ordanceSurvey/main.py b/backend/ordanceSurvey/main.py index 5961aa16..4200bd24 100644 --- a/backend/ordanceSurvey/main.py +++ b/backend/ordanceSurvey/main.py @@ -44,10 +44,7 @@ def check_if_post_code_exists_in_db_cache(postcode): if response.get("status") != 200 or "data" not in response: logger.error(f"OS Places API failed for {postcode}: {response}") - raise RuntimeError( - "A postcode that doesn't exists in ordant survey and check if its real in postcode validator!!! Postcode: {postcode}" - ) - return None + return pd.DataFrame() # Save to cache new_record = PostcodeSearchModel( @@ -77,7 +74,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: body = { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", - "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/09cc7368-0850-4145-8b04-ebd84b3263c4/2026-02-18T14:00:13.228611_d2f675c3.csv", + "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv", } s3_uri: str = body.get("s3_uri", "") @@ -91,25 +88,72 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Assumption designing with address2uprn was ran first csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - df["domna_lexiscore"] = pd.to_numeric(df["domna_lexiscore"], errors="coerce") + df["address2uprn_lexiscore"] = pd.to_numeric( + df["address2uprn_lexiscore"], errors="coerce" + ) needs_processing = df[ - df["domna_lexiscore"].isna() | (df["domna_lexiscore"] < lexiscore_threshold) + df["address2uprn_lexiscore"].isna() + | (df["address2uprn_lexiscore"] < lexiscore_threshold) ] grouped = needs_processing.groupby("postcode_clean") + # Initialise new columns + df["ordnance_survey_address"] = None + df["ordnance_survey_uprn"] = None + df["ordnance_survey_lexiscore"] = None + # Process each postcode group at a time for postcode, group in grouped: print(f"Processing postcode: {postcode} ({len(group)} rows)") valid_group = addressMatch.is_valid_postcode(postcode) - if valid_group: - postcode_cache = None - if postcode_cache is None: - postcode_cache = get_ordance_survey_record(postcode) - for index, row in group.iterrows(): - print("do something") - break + if not valid_group: + logger.warning(f"Postcode {postcode} is invalid, skipping") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue - # Add business logic to do handling - # TODO: Copy and do ordant survey logic - # TODO: Save new results to s3 ( ask Khalim if we want to save to db) + postcode_cache = check_if_post_code_exists_in_db_cache(postcode) + if postcode_cache.empty: + logger.warning(f"No OS Places data for {postcode}") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue + + for idx, row in group.iterrows(): + user_address = str(row.get("user_input", "")).strip() + if not user_address: + continue + + # Score against OS Places addresses + scores = postcode_cache["ADDRESS"].apply( + lambda addr: addressMatch.score(user_address, addr) + ) + best_idx = scores.idxmax() + best_score = scores[best_idx] + + df.at[idx, "ordnance_survey_address"] = postcode_cache.at[ + best_idx, "ADDRESS" + ] + df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"] + df.at[idx, "ordnance_survey_lexiscore"] = best_score + + # TODO: Save new results to s3 (ask Khalim if we want to save to db) + df.to_csv("ordnance_survey_results.csv", index=False) + print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")