save correct progress

2026-07-27 23:35:01 +00:00 · 2026-05-21 16:57:14 +00:00 · 2026-05-21 16:57:14 +00:00 · 8baa4c82aa
commit 8baa4c82aa
parent c833a3c91b
5 changed files with 91 additions and 11 deletions
--- a/applications/landlord_description_overrides/handler.py
+++ b/applications/landlord_description_overrides/handler.py
@ -7,6 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_csv_s3_repository import (
    UserAddressCsvS3Repository,
 )
+from domain.addresses.user_address import UserAddress


 def handler(
@ -31,11 +32,13 @@ def handler(
        user_address_repo=user_address_repo,
    )

-    list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri)
+    list_of_user_address: list[UserAddress] = orchestrator.get_user_address(
+        input_s3_uri=s3_uri
+    )

-    for each_user_address in list_of_user_address:
-        print(each_user_address.landlord_additional_info.keys())
-        break
+    col_to_desc_map = orchestrator.get_col_to_description_mappings(
+        list_of_user_address=list_of_user_address
+    )

    # Read csv of user input
    # get the column and unique variations of each description
--- a/infrastructure/csv_s3_client.py
+++ b/infrastructure/csv_s3_client.py
@ -5,6 +5,30 @@ from infrastructure.s3_client import S3Client
 from infrastructure.s3_uri import parse_s3_uri


+def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]:
+    """Disambiguate repeated CSV headers by appending an index.
+
+    The first occurrence keeps its name; each later one becomes
+    ``name_1``, ``name_2`` … so duplicate columns survive as distinct
+    keys instead of collapsing onto one (last-wins) dict entry.
+    """
+    deduped: list[str] = []
+    counts: dict[str, int] = {}
+    for name in fieldnames:
+        if name not in counts:
+            counts[name] = 0
+            deduped.append(name)
+            continue
+        counts[name] += 1
+        candidate = f"{name}_{counts[name]}"
+        while candidate in counts:
+            counts[name] += 1
+            candidate = f"{name}_{counts[name]}"
+        counts[candidate] = 0
+        deduped.append(candidate)
+    return deduped
+
+
 class CsvS3Client(S3Client):
    def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
        bucket, key = parse_s3_uri(s3_uri)
@ -19,7 +43,12 @@ class CsvS3Client(S3Client):
            # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
            text = raw.decode("cp1252")

-        reader = csv.DictReader(StringIO(text))
+        buffer = StringIO(text)
+        header = next(csv.reader(buffer), None)
+        if header is None:
+            return []
+        fieldnames = _dedupe_fieldnames(header)
+        reader = csv.DictReader(buffer, fieldnames=fieldnames)
        return [dict(row) for row in reader]

    def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
--- a/orchestration/landlord_description_overrides_orchestrator.py
+++ b/orchestration/landlord_description_overrides_orchestrator.py
@ -14,9 +14,9 @@ class LandlordDescriptionOverridesOrchestrator:

    def get_col_to_description_mappings(
        self, list_of_user_address: list[UserAddress]
-    ) -> dict[str, list[str]]:
-        mappings: dict[str, list[str]] = {}
+    ) -> dict[str, set[str]]:
+        mappings: dict[str, set[str]] = {}
        for user_address in list_of_user_address:
            for key, value in user_address.landlord_additional_info.items():
-                mappings.setdefault(key, []).append(value)
+                mappings.setdefault(key, set()).add(value)
        return mappings
--- a/tests/infrastructure/test_csv_s3_client.py
+++ b/tests/infrastructure/test_csv_s3_client.py
@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
    # act / assert
    with pytest.raises(ValueError, match="does not match client bucket"):
        csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
+
+
+def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
+    # arrange: the Hyde export has two columns both headed "Walls" — a
+    # description and a score. Without disambiguation csv.DictReader would
+    # collapse them onto one key and the description would be lost.
+    raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
+    uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8"))
+
+    # act
+    rows = csv_client.read_rows(uri)
+
+    # assert: the first occurrence keeps its name, the second gets an index.
+    assert rows == [
+        {
+            "Address 1": "1 High St",
+            "Walls": "Cavity: Filled",
+            "Roofs": "Pitched 300mm",
+            "Walls_1": "9.6",
+        }
+    ]
+
+
+def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
+    # arrange: three columns sharing one header.
+    raw = "Walls,Walls,Walls\nfirst,second,third\n"
+    uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8"))
+
+    # act
+    rows = csv_client.read_rows(uri)
+
+    # assert
+    assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]
--- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py
+++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py
@ -45,11 +45,26 @@ def test_collects_every_value_per_shared_key() -> None:

    # assert
    assert mappings == {
-        "description": ["cosy", "spacious", "bright"],
-        "condition": ["new", "worn", "fair"],
+        "description": {"cosy", "spacious", "bright"},
+        "condition": {"new", "worn", "fair"},
    }


+def test_repeated_values_collapse_to_one_variant() -> None:
+    # arrange: two addresses share the same wall description.
+    addresses = [
+        _make_user_address({"description": "cosy"}),
+        _make_user_address({"description": "cosy"}),
+        _make_user_address({"description": "bright"}),
+    ]
+
+    # act
+    mappings = _orchestrator().get_col_to_description_mappings(addresses)
+
+    # assert: a set keeps one entry per distinct variant.
+    assert mappings == {"description": {"cosy", "bright"}}
+
+
 def test_empty_address_list_yields_empty_mapping() -> None:
    # arrange / act
    mappings = _orchestrator().get_col_to_description_mappings([])
@ -66,4 +81,4 @@ def test_single_address_yields_single_value_per_key() -> None:
    mappings = _orchestrator().get_col_to_description_mappings(addresses)

    # assert
-    assert mappings == {"description": ["cosy"]}
+    assert mappings == {"description": {"cosy"}}