# Model/tests/infrastructure/test_csv_s3_client.py

from collections.abc import Iterator

import pytest
from moto import mock_aws

from infrastructure.s3.csv_s3_client import CsvS3Client
from tests.infrastructure import make_boto_client

# Name of the mocked S3 bucket: the csv_client fixture creates it and binds
# the client under test to it, and the URI assertions are built from it.
BUCKET = "csv-bucket"


@pytest.fixture
def csv_client() -> Iterator[CsvS3Client]:
    """Yield a ``CsvS3Client`` bound to a freshly created bucket.

    The client is yielded from inside the ``mock_aws`` context, so the mock
    stays active for the whole test that uses this fixture.
    """
    with mock_aws():
        s3 = make_boto_client("s3")
        # The bucket must exist before the client under test touches it.
        s3.create_bucket(Bucket=BUCKET)
        client = CsvS3Client(s3, BUCKET)
        yield client


def test_save_rows_returns_s3_uri(csv_client: CsvS3Client) -> None:
    """``save_rows`` returns an ``s3://`` URI made of the bucket and the key."""
    # arrange
    key = "uploads/addresses.csv"
    row = {"address": "1 High St", "postcode": "AB1 2CD"}
    expected_uri = f"s3://{BUCKET}/uploads/addresses.csv"
    # act
    returned_uri = csv_client.save_rows([row], key)
    # assert
    assert returned_uri == expected_uri


def test_round_trip_preserves_rows(csv_client: CsvS3Client) -> None:
    """Rows read back through the saved URI equal the rows that were saved."""
    # arrange
    high_street = {"address": "1 High St", "postcode": "AB1 2CD"}
    low_street = {"address": "2 Low St", "postcode": "XY9 8ZW"}
    original = [high_street, low_street]
    # act
    saved_uri = csv_client.save_rows(original, "uploads/addresses.csv")
    round_tripped = csv_client.read_rows(saved_uri)
    # assert
    assert round_tripped == original


def test_save_rows_rejects_empty_list(csv_client: CsvS3Client) -> None:
    """Saving an empty list raises ``ValueError`` whose message matches ``empty``."""
    # arrange
    key = "uploads/empty.csv"
    expect_rejection = pytest.raises(ValueError, match="empty")
    # act / assert
    with expect_rejection:
        csv_client.save_rows([], key)


def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
    """Reading a URI that names a different bucket raises ``ValueError``."""
    # arrange: a URI whose bucket is not the one the client was built with.
    foreign_uri = "s3://other-bucket/uploads/addresses.csv"
    # act / assert
    with pytest.raises(ValueError, match="does not match client bucket"):
        csv_client.read_rows(foreign_uri)


def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
    """A repeated header keeps its name once and gains ``_1`` on the repeat."""
    # arrange: the Hyde export has two columns both headed "Walls" — a
    # description and a score. Without disambiguation csv.DictReader would
    # collapse them onto one key and the description would be lost.
    text = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
    payload = text.encode("utf-8")
    # The first occurrence keeps its name, the second gets an index.
    expected_row = {
        "Address 1": "1 High St",
        "Walls": "Cavity: Filled",
        "Roofs": "Pitched 300mm",
        "Walls_1": "9.6",
    }
    uploaded_uri = csv_client.put_object("uploads/dup.csv", payload)

    # act
    parsed = csv_client.read_rows(uploaded_uri)

    # assert
    assert parsed == [expected_row]


def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
    """Each repeat of a header after the first gets the next numeric suffix."""
    # arrange: three columns sharing one header.
    text = "Walls,Walls,Walls\nfirst,second,third\n"
    expected = [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]
    uploaded_uri = csv_client.put_object("uploads/triple.csv", text.encode("utf-8"))

    # act
    parsed = csv_client.read_rows(uploaded_uri)

    # assert
    assert parsed == expected