from typing import Optional

from epc_api.client import EpcClient
import os
from urllib.parse import urlencode
import pandas as pd
from utils.logger import setup_logger
import json
from uuid import UUID
import uuid
from backend.app.db.functions.tasks.Tasks import SubTaskInterface
from utils.s3 import (
    save_csv_to_s3,
    read_csv_from_s3 as read_csv_from_s3_dict,
    parse_s3_uri,
)
from datetime import datetime

from backend.utils.addressMatch import AddressMatch

# Module-wide logger used by every function in this file.
logger = setup_logger()

# Credential handed to EpcClient; read once, when the module is imported.
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Fail at import time rather than on the first EPC request.
if EPC_AUTH_TOKEN is None:
    raise RuntimeError("EPC_AUTH_TOKEN not defined in env")


def score_addresses(
    df: pd.DataFrame,
    user_address: str,
    column: str = "address",
) -> pd.Series:
    """
    Score every value of one column of ``df`` against ``user_address``.

    :param df: Frame holding the candidate addresses.
    :param user_address: Address each candidate is compared with.
    :param column: Name of the column that holds the candidates.
    :return: Series aligned with ``df`` containing the result of
        ``AddressMatch.score(user_address, candidate)`` for each row.
    :raises ValueError: If ``column`` is not a column of ``df``.
    """
    if column in df.columns:

        def _score(candidate):
            return AddressMatch.score(user_address, candidate)

        return df[column].apply(_score)

    raise ValueError(f"Missing column: {column}")


def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
    """
    Fetch EPC search results for a postcode as a DataFrame.

    When a response contains exactly ``size`` rows the result may have been
    truncated, so the request is repeated with the size doubled until either
    a response comes back below the limit or ``max_attempts`` is reached.

    :param postcode: Postcode sent as the ``postcode`` query parameter.
    :param size: Page size requested from the API; a falsy value omits the
        ``size`` query parameter altogether.
    :param attempt: Number of the first attempt (kept for backward
        compatibility with the former recursive implementation).
    :param max_attempts: Highest attempt number that will be tried.
    :return: DataFrame built from the response's ``rows`` and
        ``column-names``; an empty DataFrame when the response is empty or
        has no ``rows`` key. May be truncated if the limit was still hit on
        the last attempt.
    """
    client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    # Build the URL with "/" explicitly: os.path.join uses the OS path
    # separator, which is a backslash on Windows and so yields a broken URL.
    search_url = f"{str(client.domestic.host).rstrip('/')}/search"

    while True:
        url = f"{search_url}?{urlencode({'size': size})}" if size else search_url

        search_resp = client.domestic.call(
            url=url,
            method="get",
            params={"postcode": postcode},
        )
        if not search_resp or "rows" not in search_resp:
            return pd.DataFrame()

        results_df = pd.DataFrame(
            search_resp["rows"], columns=search_resp["column-names"]
        )

        # Fewer (or more) rows than requested: nothing suggests truncation.
        if len(results_df) != size:
            return results_df

        # Exactly `size` rows came back, so there *may* be more results.
        logger.warning(
            f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. "
            f"Attempt {attempt}/{max_attempts}."
        )

        if attempt >= max_attempts:
            logger.error(
                "🚨 Max attempts reached. Results may be truncated. "
                "(Please do a manual review by the tech team.)"
            )
            return results_df

        size *= 2
        attempt += 1
        logger.info(f"🔁 Retrying with size={size}")


def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
    """
    Tell whether every non-null value in ``column`` is the given ``uprn``.

    Values are compared as whitespace-stripped strings; ``uprn`` itself is
    converted with ``str()`` but not stripped.

    :param df: Frame to inspect.
    :param uprn: The UPRN all rows are expected to share.
    :param column: Name of the column holding the UPRNs.
    :return: True only when the column exists, has at least one non-null
        value, and all such values equal ``str(uprn)``. False otherwise.
    """
    if column not in df.columns:
        return False

    # Null entries are ignored; the rest are compared in text form.
    as_text = df[column].dropna().astype(str).str.strip()
    distinct_values = as_text.drop_duplicates().tolist()

    # An empty list (no usable UPRNs) or several distinct values both fail.
    return distinct_values == [str(uprn)]


def get_uprn_candidates(
    df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
) -> pd.DataFrame:
    """
    Annotate EPC results with a similarity score and rank against an address.

    A copy of ``df`` is returned with two extra columns:

    * ``lexiscore`` - ``AddressMatch.levenshtein`` of the normalised user
      address and the row's address.
    * ``lexirank`` - dense rank of ``lexiscore``, where 1 is the highest
      score; tied rows share a rank.

    The UPRN column is normalised to text (a trailing ``.0`` left by a float
    conversion is removed). Null UPRNs stay null instead of becoming the
    literal text "nan"/"None", so callers can still drop them with
    ``dropna()``.

    DOES NOT choose or return a UPRN.

    :param df: EPC search results.
    :param user_address: Address supplied by the user.
    :param address_column: Column of ``df`` holding the EPC address.
    :param uprn_column: Column of ``df`` holding the UPRN.
    :return: Annotated copy of ``df``, best-ranked rows first.
    :raises ValueError: If either column is missing from ``df``.
    """
    for required in (address_column, uprn_column):
        if required not in df.columns:
            raise ValueError(f"Missing column: {required}")

    out = df.copy()

    user_norm = AddressMatch.normalise_address(user_address)

    out["lexiscore"] = out[address_column].apply(
        lambda x: AddressMatch.levenshtein(user_norm, x)
    )

    # Normalise UPRN to string, but keep missing values missing: astype(str)
    # alone would turn them into "nan"/"None", which look like real UPRNs.
    raw_uprn = out[uprn_column]
    uprn_text = raw_uprn.astype(str).str.replace(r"\.0$", "", regex=True)
    out[uprn_column] = uprn_text.where(raw_uprn.notna())

    # Rank: 1 = best match
    out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int)

    return out.sort_values(
        ["lexirank", "lexiscore"],
        ascending=[True, False],
    )


def get_uprn_with_epc_df(
    user_inputed_address: str,
    epc_df: pd.DataFrame,
    verbose: bool = False,
):
    """
    Resolve a UPRN for an address using a pre-fetched EPC dataframe.

    This avoids calling the API multiple times for the same postcode.
    No minimum-score threshold is applied: the best-ranked candidate is
    accepted as long as all rows tied for first place agree on one UPRN.

    :param user_inputed_address: Address supplied by the user.
    :param epc_df: EPC search results for the address's postcode.
    :param verbose: When True, return ``(uprn, address, score)`` instead of
        just the UPRN.
    :return: The UPRN as a string (or the tuple described above), or None
        when ``epc_df`` is empty, the top-ranked rows disagree on the UPRN,
        or the best match carries no UPRN.
    """
    if epc_df.empty:
        return None

    scored_df = get_uprn_candidates(
        epc_df,
        user_address=user_inputed_address,
    )

    # All rank-1 rows (possible draw)
    top_rank_df = scored_df[scored_df["lexirank"] == 1]
    best_row = top_rank_df.iloc[0]
    found_uprn = best_row["uprn"]

    # If rank-1 rows do not agree on a single UPRN → ambiguous
    if not df_has_single_uprn(top_rank_df, uprn=found_uprn):
        return None

    # A blank value, or a null that was stringified upstream, is not a UPRN.
    if found_uprn in ("", "nan", "None", "<NA>"):
        return None

    address = best_row["address"]
    score = float(best_row["lexiscore"])

    # Logged only once a usable UPRN has been confirmed.
    logger.info(f"Address found to be: {address}, with lexiscore {score}")

    if verbose:
        return (found_uprn, address, score)
    return found_uprn


def get_uprn(
    user_inputed_address: str,
    postcode: str,
    verbose: bool = False,
):
    """
    Look up the UPRN for one address, fetching EPC data for its postcode.

    The EPC search is performed on every call. When several addresses share
    a postcode, fetch the EPC data once and call get_uprn_with_epc_df
    directly instead.

    :param user_inputed_address: Address supplied by the user.
    :param postcode: Postcode used for the EPC search.
    :param verbose: Passed through to get_uprn_with_epc_df; when True the
        result is ``(uprn, address, score)`` rather than the UPRN alone.
    :return: Whatever get_uprn_with_epc_df returns for the fetched data:
        the UPRN (str) or the verbose tuple, or None when no unambiguous
        UPRN could be determined.
    """
    epc_results = get_epc_data_with_postcode(postcode=postcode)

    return get_uprn_with_epc_df(
        user_inputed_address,
        epc_results,
        verbose=verbose,
    )


def resolve_uprns_for_postcode_group(
    group_df: pd.DataFrame,
    epc_df: pd.DataFrame,
    address_col: str = "Address 1",
) -> pd.DataFrame:
    """
    Resolve a UPRN for every row of a postcode group.

    :param group_df: Rows sharing the same postcode.
    :param epc_df: EPC search results for that postcode. May be empty, with
        or without columns.
    :param address_col: Column of ``group_df`` holding the user address.
    :return: ``group_df`` (index reset) with these columns appended:
        ``found_uprn``, ``best_match_uprn``, ``best_match_address``,
        ``best_match_lexiscore`` and ``status``. ``status`` is one of
        ``no_epc_candidates``, ``zero_score``, ``ambiguous`` or ``matched``.
        The columns are present even when ``group_df`` has no rows.
    """
    result_columns = [
        "found_uprn",
        "best_match_uprn",
        "best_match_address",
        "best_match_lexiscore",
        "status",
    ]

    def outcome(status, found_uprn=None, best_uprn=None, best_address=None, best_score=None):
        # Single place that defines the shape of a diagnostics row.
        return {
            "found_uprn": found_uprn,
            "best_match_uprn": best_uprn,
            "best_match_address": best_address,
            "best_match_lexiscore": best_score,
            "status": status,
        }

    # An empty EPC result may have no columns at all, in which case scoring
    # would raise "Missing column" instead of reporting no candidates.
    has_candidates = not epc_df.empty

    results = []

    for _, row in group_df.iterrows():
        user_address = str(row[address_col]).strip()

        if not has_candidates:
            results.append(outcome("no_epc_candidates"))
            continue

        scored_df = get_uprn_candidates(
            epc_df,
            user_address=user_address,
        )

        best_score = scored_df.iloc[0]["lexiscore"]

        if best_score <= 0:
            results.append(outcome("zero_score", best_score=best_score))
            continue

        top_rank_df = scored_df[scored_df["lexirank"] == 1]
        best_row = top_rank_df.iloc[0]

        # Rows tied for first place must agree on one UPRN to count as a match.
        if not df_has_single_uprn(top_rank_df, best_row["uprn"]):
            results.append(
                outcome(
                    "ambiguous",
                    best_uprn=best_row["uprn"],
                    best_address=best_row["address"],
                    best_score=best_score,
                )
            )
            continue

        agreed_uprn = str(best_row["uprn"])
        results.append(
            outcome(
                "matched",
                found_uprn=agreed_uprn,
                best_uprn=agreed_uprn,
                best_address=best_row["address"],
                best_score=best_score,
            )
        )

    return pd.concat(
        [
            group_df.reset_index(drop=True),
            pd.DataFrame(results, columns=result_columns),
        ],
        axis=1,
    )


def save_results_to_s3(
    results_df: pd.DataFrame,
    task_id: str,
    sub_task_id: str,
    bucket_name: Optional[str] = None,
) -> bool:
    """
    Write a results DataFrame to S3 as a CSV object.

    The object key is
    ``ara_raw_outputs/<task_id>/<sub_task_id>/<timestamp>_<random>.csv``,
    where the timestamp is ``datetime.now().isoformat()`` and the random part
    is the first eight characters of a UUID4.

    :param results_df: The DataFrame containing results
    :param task_id: The task ID (first folder level of the object key)
    :param sub_task_id: The sub-task ID (second folder level of the object key)
    :param bucket_name: The S3 bucket name (defaults to the S3_BUCKET_NAME
        environment variable)
    :return: True if successful, False otherwise (errors are logged, never
        raised)
    """
    target_bucket = bucket_name
    if target_bucket is None:
        target_bucket = os.getenv("S3_BUCKET_NAME")

    if not target_bucket:
        logger.error(
            "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set"
        )
        return False

    try:
        # Timestamp plus a short random suffix keeps repeated runs distinct.
        timestamp = datetime.now().isoformat()
        suffix = str(uuid.uuid4())[:8]
        object_key = (
            f"ara_raw_outputs/{task_id}/{sub_task_id}/{timestamp}_{suffix}.csv"
        )

        if not save_csv_to_s3(results_df, target_bucket, object_key):
            logger.error("Failed to save results to S3")
            return False

        logger.info(f"Successfully saved results to s3://{target_bucket}/{object_key}")
        return True

    except Exception as exc:
        logger.error(f"Error saving results to S3: {str(exc)}")
        return False


def _build_user_address(row: dict) -> str:
    """Join a row's "Address 1".."Address 3" cells into one search string."""
    parts = []
    for field in ("Address 1", "Address 2", "Address 3"):
        value = row.get(field, "")
        # NOTE(review): blank cells may arrive as None/NaN depending on how
        # the CSV was read; str() would leak "None"/"nan" into the address.
        parts.append("" if pd.isna(value) else str(value).strip())
    return " ".join(parts).strip()


def _match_rows_for_postcode(postcode, postcode_rows: list, epc_df: pd.DataFrame) -> list:
    """Resolve a UPRN for each row of one postcode using shared EPC data."""
    outcomes = []

    for row in postcode_rows:
        user_address = "unknown"
        try:
            user_address = _build_user_address(row)

            if not user_address:
                logger.warning(
                    f"Skipping row with missing address components for postcode {postcode}"
                )
                continue

            # verbose=True returns (uprn, address, score), or None on no match
            result = get_uprn_with_epc_df(
                user_inputed_address=user_address,
                epc_df=epc_df,
                verbose=True,
            )

            if result:
                uprn, found_address, score = result
                logger.info(
                    f"Found UPRN for {user_address} in {postcode}: {uprn} (score: {score})"
                )
            else:
                uprn = found_address = score = None
                logger.warning(f"No UPRN found for {user_address} in {postcode}")

            outcomes.append(
                {
                    **row,  # Include all original data
                    "address2uprn_uprn": uprn,
                    "address2uprn_address": found_address,
                    "address2uprn_lexiscore": score,
                }
            )

        except Exception as e:
            logger.error(f"Error processing address {user_address}: {e}")
            # Still add the row with error markers
            outcomes.append(
                {
                    **row,
                    "address2uprn_uprn": None,
                    "address2uprn_address": None,
                    "address2uprn_lexiscore": None,
                    "error": str(e),
                }
            )

    return outcomes


def _mark_subtask_failed(subtask_interface, subtask_id, message: str) -> None:
    """Set a subtask to 'failed'; a failing status update is only logged."""
    try:
        subtask_interface.update_subtask_status(
            subtask_id, "failed", outputs={"error": message}
        )
    except Exception as db_error:
        logger.error(f"Failed to update subtask status: {db_error}")


def handler(event, context, local=False):
    """
    Lambda entry point: resolve UPRNs for the addresses of each record.

    Each record's ``body`` (a JSON string or a dict) must provide ``task_id``,
    ``sub_task_id`` and ``s3_uri``. The CSV at ``s3_uri`` is read, grouped by
    ``postcode_clean``, matched against EPC data fetched once per postcode,
    and the annotated rows are written back to S3. The subtask status is
    updated to 'in progress', then 'completed' or 'failed'.

    :param event: Either a batch event with a ``Records`` list or a single
        record.
    :param context: Lambda context; ``function_name`` and ``aws_request_id``
        are printed when available.
    :param local: When True, ``event`` is replaced by a hard-coded test event.
    :return: Dict with ``statusCode`` and a JSON ``body``. 500 with the list
        of errors when errors occurred and no record succeeded; otherwise
        200 with a per-record summary under ``processed`` and any errors
        under ``errors``.
    """
    print("=== Address2UPRN Lambda Handler ===")
    # getattr keeps local runs working when no real Lambda context is passed
    print(f"Function: {getattr(context, 'function_name', None)}")
    print(f"Request ID: {getattr(context, 'aws_request_id', None)}")

    # Handle local testing
    if local is True:
        event = {
            "Records": [
                {
                    "body": json.dumps(
                        {
                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
                        }
                    )
                }
            ]
        }

    print(f"Event: {json.dumps(event, indent=2, default=str)}")
    print("===================================")

    # Handle both single event and batch events (SQS, etc.)
    records = event.get("Records", [event])
    results = []  # one small summary per successfully processed record
    errors = []
    subtask_interface = SubTaskInterface()

    for record in records:
        task_id = None
        subtask_id = None
        try:
            # Parse body (inputs)
            if isinstance(record.get("body"), str):
                body = json.loads(record["body"])
            else:
                body = record.get("body", {})

            # Validate required fields
            task_id = body.get("task_id")
            subtask_id = body.get("sub_task_id")
            s3_uri = body.get("s3_uri")

            if not task_id:
                errors.append({"error": "Missing required field: task_id"})
                continue

            if not subtask_id:
                errors.append({"error": "Missing required field: sub_task_id"})
                continue

            if not s3_uri:
                errors.append({"error": "Missing required field: s3_uri"})
                continue

            # Convert task_id to UUID
            try:
                task_id = UUID(task_id) if isinstance(task_id, str) else task_id
            except ValueError as e:
                errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"})
                continue

            # Convert sub_task_id to UUID
            try:
                subtask_id = (
                    UUID(subtask_id) if isinstance(subtask_id, str) else subtask_id
                )
            except ValueError as e:
                errors.append(
                    {"error": f"Invalid UUID format for sub_task_id: {str(e)}"}
                )
                continue

            # Update existing subtask to 'in progress'
            subtask_interface.update_subtask_status(subtask_id, "in progress")
            logger.info(f"Processing subtask {subtask_id} for task {task_id}")

            # Parse S3 URI and read CSV from S3
            logger.info(f"Reading data from S3: {s3_uri}")
            try:
                bucket, key = parse_s3_uri(s3_uri)
                csv_data = read_csv_from_s3_dict(bucket, key)
                df = pd.DataFrame(csv_data)
                logger.info(f"Loaded {len(df)} rows from S3")
            except Exception as s3_error:
                logger.error(f"Failed to read data from S3: {s3_error}")
                errors.append(
                    {"error": "Failed to read data from S3", "details": str(s3_error)}
                )
                _mark_subtask_failed(subtask_interface, subtask_id, str(s3_error))
                continue

            # Process the rows
            logger.info(f"Processing {len(df)} rows for task {task_id}")

            clean_df = df.dropna(subset=["postcode_clean"])

            postcode_to_addresses = {
                postcode: group.to_dict(orient="records")
                for postcode, group in clean_df.groupby("postcode_clean", sort=False)
            }

            logger.info(f"Total postcodes: {len(postcode_to_addresses)}")

            # Process each postcode group
            results_data = []

            for postcode, postcode_rows in postcode_to_addresses.items():
                logger.info(
                    f"Processing postcode: {postcode} with {len(postcode_rows)} rows"
                )

                # Validate postcode before processing
                if not AddressMatch.is_valid_postcode(postcode):
                    logger.warning(f"Postcode {postcode} is invalid, skipping")
                    continue

                # Fetch EPC data once per postcode
                try:
                    epc_df = get_epc_data_with_postcode(postcode=postcode)
                    logger.info(
                        f"Fetched {len(epc_df)} EPC records for postcode {postcode}"
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to fetch EPC data for postcode {postcode}: {e}"
                    )
                    continue

                # Process each address in this postcode with the same EPC data
                results_data.extend(
                    _match_rows_for_postcode(postcode, postcode_rows, epc_df)
                )

            # Create results DataFrame
            result_df = pd.DataFrame(results_data)

            # Save results to S3. save_results_to_s3 reports failure through
            # its return value, so that value has to be checked.
            try:
                saved = save_results_to_s3(result_df, str(task_id), str(subtask_id))
            except Exception as s3_error:
                logger.error(f"Failed to save results to S3: {s3_error}")
                saved = False

            if not saved:
                # Without a stored output the subtask cannot count as completed.
                errors.append({"error": "Failed to save results to S3"})
                _mark_subtask_failed(
                    subtask_interface, subtask_id, "Failed to save results to S3"
                )
                continue

            rows_processed = len(results_data)
            rows_matched = sum(
                1 for item in results_data if item.get("address2uprn_uprn") is not None
            )

            # Mark subtask as completed
            try:
                subtask_interface.update_subtask_status(
                    subtask_id,
                    "completed",
                    outputs={
                        "rows_processed": rows_processed,
                        "rows_matched": rows_matched,
                    },
                )
                logger.info(f"Marked subtask {subtask_id} as completed")
            except Exception as db_error:
                logger.error(f"Failed to mark subtask as completed: {db_error}")

            # Keep the response small: counts only, never the row data itself.
            results.append(
                {
                    "task_id": str(task_id),
                    "sub_task_id": str(subtask_id),
                    "rows_processed": rows_processed,
                    "rows_matched": rows_matched,
                }
            )

        except Exception as e:
            logger.error(f"Unexpected error processing record: {e}", exc_info=True)
            errors.append({"error": "Unexpected error", "details": str(e)})
            # Mark subtask as failed if we have one
            if subtask_id:
                _mark_subtask_failed(subtask_interface, subtask_id, str(e))

    logger.info(
        f"Finished: {len(results)} record(s) processed, {len(errors)} error(s)"
    )

    # Return error if all records failed
    if errors and not results:
        return {"statusCode": 500, "body": json.dumps({"errors": errors})}

    return {
        "statusCode": 200,
        "body": json.dumps(
            {"processed": results, "errors": errors if errors else None}
        ),
    }


# TODO:
# Don't add results to return messages as it's too verbose
# Capture the exception (as e) into S3, so that the logs can be found in S3
# Upload results to S3 as well as CSV