import os import sys print("=" * 60) print("ENVIRONMENT AT STARTUP:") print("=" * 60) for k, v in sorted(os.environ.items()): print(f"{k}={v}") print("=" * 60) try: import json print("✓ json imported") import pandas as pd print("✓ pandas imported") import requests print("✓ requests imported") from uuid import UUID print("✓ UUID imported") from urllib.parse import unquote print("✓ urllib.parse imported") from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict print("✓ utils.s3 imported") from tqdm import tqdm print("✓ tqdm imported") from backend.app.db.functions.tasks.Tasks import SubTaskInterface print("✓ SubTaskInterface imported") from backend.address2UPRN.main import ( resolve_uprns_for_postcode_group, get_epc_data_with_postcode, ) print("✓ backend.address2UPRN imported") except Exception as e: print(f"✗ IMPORT ERROR: {type(e).__name__}: {e}") import traceback traceback.print_exc() raise def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: """ Parse AWS console S3 URL to extract bucket and key. Format: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path """ if "console.aws.amazon.com" in s3_uri and "?prefix=" in s3_uri: base, query = s3_uri.split("?", 1) path_parts = base.split("/s3/object/") if len(path_parts) > 1: bucket = path_parts[1] params = dict(item.split("=") for item in query.split("&") if "=" in item) key = unquote(params.get("prefix", "")) return bucket, key raise ValueError(f"Could not parse S3 URI: {s3_uri}") def sanitise_postcode(postcode: str) -> str | None: """ Normalise postcode for grouping. - Uppercase - Remove all whitespace """ if pd.isna(postcode): return None return postcode.upper().replace(" ", "") def is_valid_postcode(postcode_clean: str) -> bool: """ Validate postcode using postcodes.io. Expects a sanitised postcode (e.g. E84SQ). Returns True if valid, False otherwise. """ POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" if not postcode_clean: return False try: resp = requests.get( POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), timeout=5, ) resp.raise_for_status() return resp.json().get("result", False) except requests.RequestException: # Network issues, rate limits, etc. return False def main(): df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability") df = df.head(500) # Sanitise postcodes df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) # --- validate AFTER grouping (save API calls) --- # Get unique, non-null postcodes unique_postcodes = df["postcode_clean"].dropna().unique() # Validate each postcode once, TODOadd a progress bar postcode_validity = { pc: is_valid_postcode(pc) for pc in tqdm(unique_postcodes, total=len(unique_postcodes)) } # Map validity back onto dataframe df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) results = [] for postcode, group_df in tqdm( df[df["postcode_valid"]].groupby("postcode_clean"), desc="Resolving UPRNs by postcode", ): try: epc_df = get_epc_data_with_postcode(postcode) if epc_df.empty: tmp = group_df.copy() tmp["found_uprn"] = None tmp["status"] = "no_epc_results" results.append(tmp) continue resolved = resolve_uprns_for_postcode_group( group_df=group_df, epc_df=epc_df, ) results.append(resolved) except Exception as e: tmp = group_df.copy() tmp["found_uprn"] = None tmp["status"] = "exception" tmp["error"] = str(e) results.append(tmp) final_df = pd.concat(results, ignore_index=True) a = final_df[ [ "best_match_lexiscore", "Address 1", "best_match_address", "Postcode", "UPRN", "best_match_uprn", ] ] # add levi score to viewing b = final_df[final_df["best_match_lexiscore"] > 0] # add levi score to viewing b = b[ [ "best_match_lexiscore", "Address 1", "best_match_address", "Postcode", "UPRN", "best_match_uprn", ] ] def handler(event, context): print("=" * 60) print("HANDLER INVOKED") print("=" * 60) print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") print(f"Event received: {type(event)}") print(f"Event keys: {event.keys() if isinstance(event, dict) else 'N/A'}") # Example SQS message for testing (copy and paste into SQS): # { # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv", # } # Handle both single event and batch events (SQS, etc.) print("Extracting records from event...") records = event.get("Records", [event]) print(f"Found {len(records)} record(s) to process") results = [] errors = [] print("Initializing SubTaskInterface...") subtask_interface = SubTaskInterface() print("✓ SubTaskInterface initialized") for record in records: print("Processing record...") task_id = None subtask_id = None try: # Parse body print("Parsing body from record...") if isinstance(record.get("body"), str): body = json.loads(record["body"]) else: body = record.get("body", {}) print(f"Body parsed: {body}") # Validate required fields task_id = body.get("task_id") s3_uri = body.get("s3_uri") print(f"task_id: {task_id}, s3_uri: {s3_uri}") if not task_id: errors.append({"error": "Missing required field: task_id"}) continue if not s3_uri: errors.append({"error": "Missing required field: s3_uri"}) continue # Convert task_id to UUID print("Converting task_id to UUID...") try: task_id = UUID(task_id) if isinstance(task_id, str) else task_id print(f"UUID conversion successful: {task_id}") except ValueError as e: errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) continue # Create a new subtask for this postcode splitter invocation print(f"Creating subtask for task {task_id}...") subtask_id = subtask_interface.create_subtask( task_id=task_id, inputs={"s3_uri": s3_uri} ) print(f"Created subtask {subtask_id} for task {task_id}") # Process normal flow print(f"Processing task_id: {task_id}") print(f"Processing s3_uri: {s3_uri}") # Read CSV from S3 print("Parsing S3 URI...") bucket, key = parse_s3_console_url(s3_uri) print(f"Bucket: {bucket}, Key: {key}") print("Fetching CSV from S3...") csv_data = read_csv_from_s3_dict(bucket, key) print(f"CSV fetched: {len(csv_data)} rows") print("Creating DataFrame...") df = pd.DataFrame(csv_data) print(f"DataFrame created: {len(df)} rows, {len(df.columns)} columns") # Get head for demo print("Getting DataFrame head...") df_head = df.head() print("DataFrame head:") print(df_head) df_head_dict = df_head.to_dict("records") print("Appending result...") results.append( { "message": "Postcode splitter processing started", "task_id": str(task_id), "s3_uri": s3_uri, "subtask_id": str(subtask_id), } ) print("Result appended") # Mark subtask as complete after successful processing print("Updating subtask status to complete...") subtask_interface.update_subtask_status( subtask_id, "complete", outputs={ "status": "processing_complete", "s3_uri": s3_uri, "rows_processed": len(df), }, ) print(f"Subtask {subtask_id} marked as complete") except json.JSONDecodeError as e: errors.append({"error": "Invalid JSON in request body", "details": str(e)}) # Mark subtask as failed if we have one if subtask_id: try: subtask_interface.update_subtask_status( subtask_id, "failed", outputs={"error": str(e)} ) except Exception as db_error: print(f"Failed to update subtask status: {db_error}") except Exception as e: print(f"Unexpected error processing record: {e}") errors.append({"error": "Unexpected error", "details": str(e)}) # Mark subtask as failed if we have one if subtask_id: try: subtask_interface.update_subtask_status( subtask_id, "failed", outputs={"error": str(e)} ) except Exception as db_error: print(f"Failed to update subtask status: {db_error}") # Return error if all records failed if errors and not results: return {"statusCode": 500, "body": json.dumps({"errors": errors})} return { "statusCode": 200, "body": json.dumps( {"processed": results, "errors": errors if errors else None} ), } if __name__ == "__main__": main()