import json
import os
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode
from uuid import UUID

import pandas as pd
from epc_api.client import EpcClient

from backend.app.db.functions.tasks.Tasks import SubTaskInterface
from backend.utils.addressMatch import AddressMatch
from utils.logger import setup_logger
from utils.s3 import (
    save_csv_to_s3,
    read_csv_from_s3 as read_csv_from_s3_dict,
    parse_s3_uri,
)

logger = setup_logger()

EPC_AUTH_TOKEN = os.getenv(
    "EPC_AUTH_TOKEN",
)

if EPC_AUTH_TOKEN is None:
    raise RuntimeError("EPC_AUTH_TOKEN not defined in env")


def score_addresses(
    df: pd.DataFrame,
    user_address: str,
    column: str = "address",
) -> pd.Series:
    """
    Score every value of ``df[column]`` against ``user_address``.

    :param df: Frame holding the candidate addresses
    :param user_address: Address to compare each candidate with
    :param column: Name of the column containing candidate addresses
    :return: Series of ``AddressMatch.score`` results, aligned with ``df``
    :raises ValueError: if ``column`` is not present in ``df``
    """
    if column not in df.columns:
        raise ValueError(f"Missing column: {column}")

    return df[column].apply(lambda x: AddressMatch.score(user_address, x))


def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
    """
    Fetch EPC search results for a postcode.

    If the number of rows returned equals the requested ``size`` the result
    may be truncated, so the search is repeated with double the size until
    either fewer rows come back or ``max_attempts`` is reached.

    :param postcode: Postcode to search for
    :param size: Page size requested from the API (falsy = no explicit size)
    :param attempt: Number of the first attempt (used for logging/limits)
    :param max_attempts: Maximum number of attempts before giving up
    :return: DataFrame of EPC rows; an empty DataFrame (no columns) when the
        response is empty or has no ``rows`` key
    """
    client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    while True:
        # Build the URL with plain string handling: os.path.join is meant for
        # filesystem paths and would use "\\" as separator on Windows.
        url = f"{client.domestic.host.rstrip('/')}/search"
        if size:
            url += "?" + urlencode({"size": size})

        search_resp = client.domestic.call(
            url=url,
            method="get",
            params={"postcode": postcode},
        )

        if not search_resp or "rows" not in search_resp:
            return pd.DataFrame()

        results_df = pd.DataFrame(
            search_resp["rows"], columns=search_resp["column-names"]
        )

        # Only a result that exactly fills the requested size can have been
        # truncated. A falsy size means no limit was requested at all.
        if not size or len(results_df) != size:
            return results_df

        logger.warning(
            f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. "
            f"Attempt {attempt}/{max_attempts}."
        )

        if attempt >= max_attempts:
            logger.error(
                "🚨 Max attempts reached. Results may be truncated. "
                "(Please do a manual review by the tech team.)"
            )
            return results_df

        logger.info(f"🔁 Retrying with size={size * 2}")
        size *= 2
        attempt += 1


def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
    """
    Returns True if all non-null UPRNs in df match the given uprn.
    Returns False otherwise (including when the column is missing or holds
    no non-null values).
    """
    if column not in df.columns:
        return False

    # Drop nulls and normalise to stripped strings
    uprns = df[column].dropna().astype(str).str.strip().unique()

    # Exactly one unique UPRN and it matches. The candidate is stripped too,
    # so stray whitespace cannot turn an agreement into "ambiguous".
    return len(uprns) == 1 and uprns[0] == str(uprn).strip()


def get_uprn_candidates(
    df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
) -> pd.DataFrame:
    """
    Annotate EPC results with similarity scores and ranks.

    Adds two columns to a copy of ``df``:
    - ``lexiscore``: ``AddressMatch.levenshtein`` of the normalised user
      address against each candidate address
    - ``lexirank``: dense rank of ``lexiscore``, 1 = highest score

    The UPRN column is normalised to strings; missing values become ``""``.

    Returns a DataFrame sorted by descending lexiscore.
    DOES NOT choose or return a UPRN.

    :raises ValueError: if the address or UPRN column is missing
    """
    if address_column not in df.columns:
        raise ValueError(f"Missing column: {address_column}")

    if uprn_column not in df.columns:
        raise ValueError(f"Missing column: {uprn_column}")

    out = df.copy()

    user_norm = AddressMatch.normalise_address(user_address)

    # NOTE(review): only the user address is normalised here; presumably
    # AddressMatch.levenshtein normalises the candidate itself - confirm.
    out["lexiscore"] = out[address_column].apply(
        lambda x: AddressMatch.levenshtein(user_norm, x)
    )

    # Normalise UPRN to string. Missing values are mapped to "" first:
    # astype(str) alone would turn NaN/None into the literal "nan"/"None",
    # which downstream code would mistake for a real UPRN.
    raw_uprns = out[uprn_column]
    out[uprn_column] = (
        raw_uprns.astype(object)
        .where(raw_uprns.notna(), "")
        .astype(str)
        .str.strip()
        .str.replace(r"\.0$", "", regex=True)
    )

    # Rank: 1 = best match
    out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int)

    return out.sort_values(
        ["lexirank", "lexiscore"],
        ascending=[True, False],
    )


def get_uprn_with_epc_df(
    user_inputed_address: str,
    epc_df: pd.DataFrame,
    verbose: bool = False,
) -> Union[None, str, Tuple[str, Any, float]]:
    """
    Return uprn (str) using a pre-fetched EPC dataframe.
    This avoids calling the API multiple times for the same postcode.

    :param user_inputed_address: Address supplied by the user
    :param epc_df: EPC search results for the address' postcode
    :param verbose: When True return ``(uprn, matched_address, lexiscore)``
        instead of just the UPRN
    :return: None when ``epc_df`` is empty, when the best-ranked rows do not
        agree on a single UPRN, or when the agreed UPRN is empty; otherwise
        the UPRN (or the verbose tuple)
    """
    if epc_df.empty:
        return None

    scored_df = get_uprn_candidates(
        epc_df,
        user_address=user_inputed_address,
    )

    # NOTE: no minimum-score threshold is applied; the best-ranked candidate
    # is accepted whatever its score.

    # All rank-1 rows (possible draw)
    top_rank_df = scored_df[scored_df["lexirank"] == 1]
    if top_rank_df.empty:
        return None

    best = top_rank_df.iloc[0]

    # If rank-1 rows do not agree on a single UPRN → ambiguous
    if not df_has_single_uprn(top_rank_df, uprn=best["uprn"]):
        return None

    address = best["address"]
    score = float(best["lexiscore"])
    logger.info(f"Address found to be: {address}, with lexiscore {score}")

    # Safe to return the agreed UPRN (an empty string means "EPC has no UPRN")
    found_uprn = best["uprn"]
    if found_uprn == "":
        return None

    if verbose:
        return (found_uprn, address, score)
    return found_uprn


def get_uprn(
    user_inputed_address: str,
    postcode: str,
    verbose: bool = False,
):
    """
    Return uprn (str), or ``(uprn, address, lexiscore)`` when ``verbose``.
    Return None when no EPC data was found, when the best matches are
    ambiguous, or when the matching EPC has no UPRN.

    This function fetches EPC data via API for a single postcode.
    For processing multiple addresses in the same postcode, use get_uprn_with_epc_df instead.
    """
    df = get_epc_data_with_postcode(postcode=postcode)
    return get_uprn_with_epc_df(
        user_inputed_address=user_inputed_address,
        epc_df=df,
        verbose=verbose,
    )


def resolve_uprns_for_postcode_group(
    group_df: pd.DataFrame,
    epc_df: pd.DataFrame,
    address_col: str = "Address 1",
) -> pd.DataFrame:
    """
    Given:
      - group_df: rows sharing the same postcode
      - epc_df: EPC search results for that postcode

    Returns:
      group_df + found_uprn + diagnostics

    The ``status`` column is one of ``no_epc_candidates``, ``zero_score``,
    ``ambiguous`` or ``matched``.
    """

    def _diagnostic(status, found=None, uprn=None, address=None, score=None):
        # Single place that defines the shape of one diagnostics row.
        return {
            "found_uprn": found,
            "best_match_uprn": uprn,
            "best_match_address": address,
            "best_match_lexiscore": score,
            "status": status,
        }

    results = []

    for _, row in group_df.iterrows():
        # An empty EPC frame may have no columns at all (see
        # get_epc_data_with_postcode), in which case get_uprn_candidates
        # would raise; treat it as "no candidates" instead.
        if epc_df.empty:
            results.append(_diagnostic("no_epc_candidates"))
            continue

        user_address = str(row[address_col]).strip()

        scored_df = get_uprn_candidates(
            epc_df,
            user_address=user_address,
        )

        if scored_df.empty:
            results.append(_diagnostic("no_epc_candidates"))
            continue

        best_score = scored_df.iloc[0]["lexiscore"]

        if best_score <= 0:
            results.append(_diagnostic("zero_score", score=best_score))
            continue

        top_rank_df = scored_df[scored_df["lexirank"] == 1]
        best = top_rank_df.iloc[0]

        if not df_has_single_uprn(top_rank_df, best["uprn"]):
            results.append(
                _diagnostic(
                    "ambiguous",
                    uprn=best["uprn"],
                    address=best["address"],
                    score=best_score,
                )
            )
            continue

        results.append(
            _diagnostic(
                "matched",
                found=str(best["uprn"]),
                uprn=str(best["uprn"]),
                address=best["address"],
                score=best_score,
            )
        )

    return pd.concat(
        [group_df.reset_index(drop=True), pd.DataFrame(results)],
        axis=1,
    )


def save_results_to_s3(
    results_df: pd.DataFrame,
    task_id: str,
    sub_task_id: str,
    bucket_name: Optional[str] = None,
) -> bool:
    """
    Save results DataFrame to S3 as CSV.

    The object key is
    ``ara_raw_outputs/<task_id>/<sub_task_id>/<timestamp>_<random>.csv``.

    :param results_df: The DataFrame containing results
    :param task_id: The task ID (used in the object key)
    :param sub_task_id: The sub-task ID (used in the object key)
    :param bucket_name: The S3 bucket name (defaults to env variable)
    :return: True if successful, False otherwise (never raises)
    """
    if bucket_name is None:
        bucket_name = os.getenv("S3_BUCKET_NAME")

    if not bucket_name:
        logger.error(
            "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set"
        )
        return False

    try:
        # Timestamp plus a short random suffix keeps repeated runs from
        # overwriting each other.
        file_name = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}"
        file_key = f"ara_raw_outputs/{task_id}/{sub_task_id}/{file_name}.csv"

        # Save to S3
        if save_csv_to_s3(results_df, bucket_name, file_key):
            logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}")
            return True

        logger.error("Failed to save results to S3")
        return False

    except Exception as e:
        logger.error(f"Error saving results to S3: {str(e)}")
        return False


def _build_user_address(row: Dict[str, Any]) -> str:
    """Join the non-empty "Address 1..3" values of a row with single spaces."""
    parts = []
    for key in ("Address 1", "Address 2", "Address 3"):
        value = row.get(key)
        # Null cells (None/NaN) must be skipped explicitly: str() would turn
        # them into the literal text "None"/"nan" inside the address.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)


def _to_uuid(value):
    """Convert a string to UUID; non-string values are passed through."""
    return UUID(value) if isinstance(value, str) else value


def _mark_subtask_failed(subtask_interface, subtask_id, error) -> None:
    """Best-effort: flag the subtask as failed; logs instead of raising."""
    try:
        subtask_interface.update_subtask_status(
            subtask_id, "failed", outputs={"error": str(error)}
        )
    except Exception as db_error:
        logger.error(f"Failed to update subtask status: {db_error}")


def _match_postcode_rows(
    postcode, postcode_rows: List[Dict[str, Any]], epc_df: pd.DataFrame
) -> List[Dict[str, Any]]:
    """Match every input row of one postcode against its EPC results."""
    matched_rows = []

    for row in postcode_rows:
        user_address = _build_user_address(row)

        if not user_address:
            logger.warning(
                f"Skipping row with missing address components for postcode {postcode}"
            )
            continue

        try:
            # Get UPRN using the pre-fetched EPC data with all return options
            result = get_uprn_with_epc_df(
                user_inputed_address=user_address,
                epc_df=epc_df,
                verbose=True,
            )
        except Exception as e:
            logger.error(f"Error processing address {user_address}: {e}")
            # Still add the row with error markers
            matched_rows.append(
                {
                    **row,
                    "address2uprn_uprn": None,
                    "address2uprn_address": None,
                    "address2uprn_lexiscore": None,
                    "error": str(e),
                }
            )
            continue

        if result:
            uprn, found_address, score = result
            logger.info(
                f"Found UPRN for {user_address} in {postcode}: {uprn} (score: {score})"
            )
        else:
            uprn = found_address = score = None
            logger.warning(f"No UPRN found for {user_address} in {postcode}")

        matched_rows.append(
            {
                **row,  # Include all original data
                "address2uprn_uprn": uprn,
                "address2uprn_address": found_address,
                "address2uprn_lexiscore": score,
            }
        )

    return matched_rows


def _process_dataframe(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Group input rows by ``postcode_clean`` and match each group once."""
    clean_df = df.dropna(subset=["postcode_clean"])
    postcode_to_addresses = {
        postcode: group.to_dict(orient="records")
        for postcode, group in clean_df.groupby("postcode_clean", sort=False)
    }
    logger.info(f"Total postcodes: {len(postcode_to_addresses)}")

    results_data = []
    for postcode, postcode_rows in postcode_to_addresses.items():
        logger.info(f"Processing postcode: {postcode} with {len(postcode_rows)} rows")

        # Validate postcode before processing
        # NOTE(review): rows of invalid postcodes (and of postcodes whose EPC
        # fetch fails below) are left out of the output entirely.
        if not AddressMatch.is_valid_postcode(postcode):
            logger.warning(f"Postcode {postcode} is invalid, skipping")
            continue

        # Fetch EPC data once per postcode
        try:
            epc_df = get_epc_data_with_postcode(postcode=postcode)
            logger.info(f"Fetched {len(epc_df)} EPC records for postcode {postcode}")
        except Exception as e:
            logger.error(f"Failed to fetch EPC data for postcode {postcode}: {e}")
            continue

        results_data.extend(_match_postcode_rows(postcode, postcode_rows, epc_df))

    return results_data


def handler(event, context, local=False):
    """
    Lambda entry point: resolve UPRNs for batches of addresses.

    Each record carries ``task_id``, ``sub_task_id`` and ``s3_uri`` (either
    as a JSON string / dict under ``body``, or directly on the event for a
    direct invocation). For every record the CSV at ``s3_uri`` is read,
    rows are matched per postcode, the results are written back to S3 and
    the subtask status is updated.

    :param event: Lambda event (SQS-style ``Records`` batch or single event)
    :param context: Lambda context; only used for logging
    :param local: When True, ``event`` is replaced by a hard-coded test event
    :return: ``{"statusCode", "body"}``; 500 when errors occurred and no
        record completed, otherwise 200 with per-record summaries
    """
    print("=== Address2UPRN Lambda Handler ===")
    # getattr: context may be None when the handler is run locally.
    print(f"Function: {getattr(context, 'function_name', 'unknown')}")
    print(f"Request ID: {getattr(context, 'aws_request_id', 'unknown')}")

    # Handle local testing
    if local is True:
        event = {
            "Records": [
                {
                    "body": json.dumps(
                        {
                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
                        }
                    )
                }
            ]
        }

    print(f"Event: {json.dumps(event, indent=2, default=str)}")
    print("===================================")

    # Handle both single event and batch events (SQS, etc.)
    records = event.get("Records", [event])

    results = []
    errors = []

    subtask_interface = SubTaskInterface()

    for record in records:
        task_id = None
        subtask_id = None

        try:
            # Parse body (inputs). A record without a "body" key is treated
            # as the payload itself so direct invocations work.
            body = record.get("body", record)
            if isinstance(body, str):
                body = json.loads(body)
            if not isinstance(body, dict):
                errors.append({"error": "Record body must be a JSON object"})
                continue

            # Validate required fields
            task_id = body.get("task_id")
            subtask_id = body.get("sub_task_id")
            s3_uri = body.get("s3_uri")

            if not task_id:
                errors.append({"error": "Missing required field: task_id"})
                continue

            if not subtask_id:
                errors.append({"error": "Missing required field: sub_task_id"})
                continue

            if not s3_uri:
                errors.append({"error": "Missing required field: s3_uri"})
                continue

            # Convert task_id to UUID
            try:
                task_id = _to_uuid(task_id)
            except ValueError as e:
                errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"})
                continue

            # Convert sub_task_id to UUID
            try:
                subtask_id = _to_uuid(subtask_id)
            except ValueError as e:
                errors.append(
                    {"error": f"Invalid UUID format for sub_task_id: {str(e)}"}
                )
                continue

            # Update existing subtask to 'in progress'
            subtask_interface.update_subtask_status(subtask_id, "in progress")
            logger.info(f"Processing subtask {subtask_id} for task {task_id}")

            # Parse S3 URI and read CSV from S3
            logger.info(f"Reading data from S3: {s3_uri}")
            try:
                bucket, key = parse_s3_uri(s3_uri)
                csv_data = read_csv_from_s3_dict(bucket, key)
                df = pd.DataFrame(csv_data)
                logger.info(f"Loaded {len(df)} rows from S3")
            except Exception as s3_error:
                logger.error(f"Failed to read data from S3: {s3_error}")
                errors.append(
                    {"error": "Failed to read data from S3", "details": str(s3_error)}
                )
                _mark_subtask_failed(subtask_interface, subtask_id, s3_error)
                continue

            # Process the rows
            logger.info(f"Processing {len(df)} rows for task {task_id}")
            results_data = _process_dataframe(df)

            # Save results to S3. save_results_to_s3 reports failure through
            # its return value; a subtask whose output was not persisted must
            # not be reported as completed.
            result_df = pd.DataFrame(results_data)
            if not save_results_to_s3(result_df, str(task_id), str(subtask_id)):
                save_error = "Failed to save results to S3"
                logger.error(f"{save_error} for subtask {subtask_id}")
                errors.append(
                    {"error": save_error, "details": f"sub_task_id={subtask_id}"}
                )
                _mark_subtask_failed(subtask_interface, subtask_id, save_error)
                continue

            rows_matched = sum(
                1 for r in results_data if r.get("address2uprn_uprn") is not None
            )
            outputs = {
                "rows_processed": len(results_data),
                "rows_matched": rows_matched,
            }

            # Mark subtask as completed
            try:
                subtask_interface.update_subtask_status(
                    subtask_id,
                    "completed",
                    outputs=outputs,
                )
                logger.info(f"Marked subtask {subtask_id} as completed")
            except Exception as db_error:
                logger.error(f"Failed to mark subtask as completed: {db_error}")

            # Compact per-record summary only; the full rows live in S3.
            results.append(
                {
                    "task_id": str(task_id),
                    "sub_task_id": str(subtask_id),
                    **outputs,
                }
            )

        except Exception as e:
            logger.error(f"Unexpected error processing record: {e}", exc_info=True)
            errors.append({"error": "Unexpected error", "details": str(e)})

            # Mark subtask as failed if we have one
            if subtask_id:
                _mark_subtask_failed(subtask_interface, subtask_id, e)

    logger.info(results)

    # Return error if all records failed
    if errors and not results:
        return {"statusCode": 500, "body": json.dumps({"errors": errors})}

    return {
        "statusCode": 200,
        "body": json.dumps(
            {"processed": results, "errors": errors if errors else None}
        ),
    }


# TODO:
# Capture exceptions (as e) into S3 so the logs can be found there
# Upload results to S3 in additional formats as well as CSV