Model/backend/ordnanceSurvey/main.py

from typing import Any, Optional
import json
from utils.logger import setup_logger
import logging
from backend.utils.subtasks import subtask_handler
from utils.s3 import (
    save_csv_to_s3,
    read_csv_from_s3 as read_csv_from_s3_dict,
    parse_s3_uri,
)
from backend.utils.addressMatch import AddressMatch
from backend.app.db.connection import get_db_session
from backend.app.db.models.postcode_search import PostcodeSearchModel
from backend.ordnanceSurvey.helpers import (
    lookup_os_places,
    os_places_results_to_dataframe,
)
from backend.app.config import get_settings
from sqlalchemy import select
from datetime import datetime
import uuid
import os

import pandas as pd

# Module-level logger, created once at import time and used by every function below.
logger: logging.Logger = setup_logger()


def check_if_post_code_exists_in_db_cache(postcode: str) -> pd.DataFrame:
    """
    Return OS Places results for a postcode, reading through the database cache.

    The cache table is queried first. On a miss the OS Places API is called
    and, when the call succeeds, the raw payload is stored in the cache before
    being converted to a DataFrame.

    :param postcode: The postcode to look up (used verbatim as the cache key)
    :return: DataFrame built by ``os_places_results_to_dataframe``; an empty
        DataFrame when the OS Places API call fails
    """
    # Local import: only this function needs the exception type.
    from sqlalchemy.exc import SQLAlchemyError

    # Keep the session scoped to the read so that no database connection is
    # held open while waiting on the external HTTP call below.
    with get_db_session() as session:
        cached = (
            session.execute(
                select(PostcodeSearchModel).where(
                    PostcodeSearchModel.postcode == postcode
                )
            )
            .scalars()
            .first()
        )
        if cached is not None:
            # Convert while the session is still open so attribute access on
            # the ORM object cannot hit a detached instance.
            return os_places_results_to_dataframe(cached.result_data)

    # Cache miss — fetch from OS Places API
    api_key = get_settings().ORDNANCE_SURVEY_API_KEY
    response = lookup_os_places(postcode, api_key)

    if response.get("status") != 200 or "data" not in response:
        logger.error(f"OS Places API failed for {postcode}: {response}")
        return pd.DataFrame()

    os_places_data = response["data"]

    # Save to cache. The write is best-effort: the API data is already in
    # hand, so a database failure (e.g. another worker inserting the same
    # postcode concurrently) must not discard it.
    try:
        with get_db_session() as session:
            session.add(
                PostcodeSearchModel(
                    postcode=postcode,
                    result_data=os_places_data,
                )
            )
            session.commit()
    except SQLAlchemyError:
        logger.warning(
            f"Could not cache OS Places result for {postcode}", exc_info=True
        )

    return os_places_results_to_dataframe(os_places_data)


def save_results_to_s3(
    results_df: pd.DataFrame,
    task_id: str,
    sub_task_id: str,
    bucket_name: Optional[str] = None,
) -> bool:
    """
    Upload a results DataFrame to S3 as a CSV file.

    The object key has the form
    ``ara_ordnance_survey_outputs/<task_id>/<sub_task_id>/ordnanceSurvey/<timestamp>_<uuid8>.csv``.

    :param results_df: The DataFrame to upload
    :param task_id: The task ID (first folder level under the output prefix)
    :param sub_task_id: The subtask ID (second folder level)
    :param bucket_name: Target bucket; when None the S3_BUCKET_NAME
        environment variable is used instead
    :return: True when the upload succeeded, False on any failure
        (no bucket configured, upload reported failure, or an exception)
    """
    target_bucket = os.getenv("S3_BUCKET_NAME") if bucket_name is None else bucket_name

    # Guard: nothing can be uploaded without a bucket.
    if not target_bucket:
        logger.error(
            "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set"
        )
        return False

    try:
        # Timestamp plus a short random suffix keeps object names unique.
        stem = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}"
        object_key = (
            f"ara_ordnance_survey_outputs/{task_id}/{sub_task_id}"
            f"/ordnanceSurvey/{stem}.csv"
        )

        if not save_csv_to_s3(results_df, target_bucket, object_key):
            logger.error("Failed to save results to S3")
            return False

        logger.info(f"Successfully saved results to s3://{target_bucket}/{object_key}")
        return True
    except Exception as exc:
        logger.error(f"Error saving results to S3: {str(exc)}")
        return False


# Marker written to every output column when a postcode cannot be resolved.
_POSTCODE_NOT_FOUND = "postcode not found in ordnance survey"

# Columns this handler appends to the input data.
_OUTPUT_COLUMNS = [
    "ordnance_survey_address",
    "ordnance_survey_uprn",
    "ordnance_survey_lexiscore",
]

# Input columns concatenated (in this order) to form the address to match.
_ADDRESS_COLUMNS = ("Address 1", "Address 2", "Address 3")

# Sample message body substituted for the real one when ``local`` is True.
_LOCAL_SAMPLE_BODY = {
    "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
    "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
    "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv",
    "lexiscore_column": "address2uprn_lexiscore",
}


def _address_part(row: pd.Series, column: str) -> str:
    """Return one address component as stripped text, treating nulls as empty."""
    value = row.get(column, "")
    # Without this check str() would turn None/NaN into the literal words
    # "None"/"nan", which would then be scored as part of the address.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def _build_address_query(row: pd.Series) -> str:
    """Join the address columns of ``row`` into the string scored against OS Places."""
    return " ".join(_address_part(row, column) for column in _ADDRESS_COLUMNS).strip()


def _mark_postcode_not_found(df: pd.DataFrame, row_index: pd.Index) -> None:
    """Write the not-found marker into every output column for the given rows."""
    df.loc[row_index, _OUTPUT_COLUMNS] = _POSTCODE_NOT_FOUND


@subtask_handler()  # This assumes task_id and subtask_id is defined in event.Records.body
def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
    """
    Match input addresses against OS Places data, one postcode group at a time.

    Steps:
      1. Read the CSV referenced by ``body["s3_uri"]`` into a DataFrame.
      2. Select the rows to process: when ``body["lexiscore_column"]`` names an
         existing column, only rows whose value is missing/non-numeric or
         below ``body["lexiscore_threshold"]`` (default 0.5); otherwise all rows.
      3. For each ``postcode_clean`` group, look up OS Places addresses and
         record the best-scoring address, its UPRN and the score in the
         ``ordnance_survey_*`` columns. Rows whose postcode is invalid or has
         no OS Places data get a "not found" marker in all three columns.
      4. Upload the full DataFrame to S3 when both ``task_id`` and
         ``sub_task_id`` are present in the body.

    :param body: Message body with keys ``s3_uri`` (required), ``task_id``,
        ``sub_task_id``, ``lexiscore_column`` and ``lexiscore_threshold``
    :param context: Not used by this function
    :param local: When True the body is replaced by a built-in sample message
        and the results are also written to ``ordnance_survey_results.csv``
    :raises RuntimeError: If ``s3_uri`` is missing, the input has no
        ``postcode_clean`` column, or the upload to S3 fails
    """
    if local is True:
        # Local test runs ignore the incoming message and use the sample body.
        body = dict(_LOCAL_SAMPLE_BODY)

    s3_uri: str = body.get("s3_uri", "")
    raw_threshold = body.get("lexiscore_threshold")
    # Coerce so a JSON string ("0.5") or explicit null does not break the
    # numeric comparison further down.
    lexiscore_threshold: float = 0.5 if raw_threshold is None else float(raw_threshold)
    lexiscore_column: Optional[str] = body.get("lexiscore_column", None)
    task_id: str = body.get("task_id", "")
    sub_task_id: str = body.get("sub_task_id", "")

    if s3_uri == "":
        raise RuntimeError("Missing s3_uri in message body")

    bucket, key = parse_s3_uri(s3_uri)

    # Assumption designing with address2uprn was ran first
    csv_data = read_csv_from_s3_dict(bucket, key)
    df = pd.DataFrame(csv_data)

    if "postcode_clean" not in df.columns:
        raise RuntimeError("Input CSV is missing the required 'postcode_clean' column")

    # If lexiscore_column is specified, use it; otherwise process all rows
    if lexiscore_column and lexiscore_column in df.columns:
        df[lexiscore_column] = pd.to_numeric(df[lexiscore_column], errors="coerce")
        low_confidence = df[lexiscore_column].isna() | (
            df[lexiscore_column] < lexiscore_threshold
        )
        needs_processing = df[low_confidence]
    else:
        # Default: process all rows
        needs_processing = df

    grouped = needs_processing.groupby("postcode_clean")

    # Initialise new columns
    for column in _OUTPUT_COLUMNS:
        df[column] = None

    # Process each postcode group at a time
    for postcode, group in grouped:
        logger.info(f"Processing postcode: {postcode} ({len(group)} rows)")

        if not AddressMatch.is_valid_postcode(postcode):
            logger.warning(f"Postcode {postcode} is invalid, skipping")
            _mark_postcode_not_found(df, group.index)
            continue

        postcode_cache = check_if_post_code_exists_in_db_cache(postcode)
        if postcode_cache.empty:
            logger.warning(f"No OS Places data for {postcode}")
            _mark_postcode_not_found(df, group.index)
            continue

        # Loop-invariant: the candidate addresses are the same for every row
        # in this postcode group.
        candidate_addresses = postcode_cache["ADDRESS"]

        for idx, row in group.iterrows():
            address_query = _build_address_query(row)
            if not address_query:
                continue

            # Score against OS Places addresses
            scores = candidate_addresses.apply(
                lambda addr, query=address_query: AddressMatch.score(query, addr)
            )
            best_idx = scores.idxmax()
            best_score = scores[best_idx]

            # A non-positive best score means nothing matched; leave the row blank.
            if best_score <= 0:
                continue

            df.at[idx, "ordnance_survey_address"] = postcode_cache.at[
                best_idx, "ADDRESS"
            ]
            df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"]
            df.at[idx, "ordnance_survey_lexiscore"] = best_score

    # Save results locally
    if local:
        df.to_csv("ordnance_survey_results.csv", index=False)
        # Local-only console feedback for the developer running the script.
        print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")

    # Save results to S3
    if not (task_id and sub_task_id):
        logger.warning(
            "task_id or sub_task_id missing from message body; "
            "results were not uploaded to S3"
        )
        return

    # save_results_to_s3 reports failure via its return value; surface it so
    # the subtask is not treated as successful when no output was written.
    if not save_results_to_s3(df, task_id, sub_task_id):
        raise RuntimeError(
            f"Failed to save Ordnance Survey results to S3 for task {task_id}, "
            f"sub-task {sub_task_id}"
        )