diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 003bd4d3..65297dac 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,6 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, ) +from domain.addresses.user_address import UserAddress def handler( @@ -31,11 +32,13 @@ def handler( user_address_repo=user_address_repo, ) - list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri) + list_of_user_address: list[UserAddress] = orchestrator.get_user_address( + input_s3_uri=s3_uri + ) - for each_user_address in list_of_user_address: - print(each_user_address.landlord_additional_info.keys()) - break + col_to_desc_map = orchestrator.get_col_to_description_mappings( + list_of_user_address=list_of_user_address + ) # Read csv of user input # get the column and unique variations of each description diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py index 8af8de73..d058ba53 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/csv_s3_client.py @@ -5,6 +5,30 @@ from infrastructure.s3_client import S3Client from infrastructure.s3_uri import parse_s3_uri +def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]: + """Disambiguate repeated CSV headers by appending an index. + + The first occurrence keeps its name; each later one becomes + ``name_1``, ``name_2`` … so duplicate columns survive as distinct + keys instead of collapsing onto one (last-wins) dict entry. + """ + deduped: list[str] = [] + counts: dict[str, int] = {} + for name in fieldnames: + if name not in counts: + counts[name] = 0 + deduped.append(name) + continue + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + while candidate in counts: + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + counts[candidate] = 0 + deduped.append(candidate) + return deduped + + class CsvS3Client(S3Client): def read_rows(self, s3_uri: str) -> list[dict[str, str]]: bucket, key = parse_s3_uri(s3_uri) @@ -19,7 +43,12 @@ class CsvS3Client(S3Client): # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8. text = raw.decode("cp1252") - reader = csv.DictReader(StringIO(text)) + buffer = StringIO(text) + header = next(csv.reader(buffer), None) + if header is None: + return [] + fieldnames = _dedupe_fieldnames(header) + reader = csv.DictReader(buffer, fieldnames=fieldnames) return [dict(row) for row in reader] def save_rows(self, rows: list[dict[str, str]], key: str) -> str: diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 0751975a..7f3c3396 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -14,9 +14,9 @@ class LandlordDescriptionOverridesOrchestrator: def get_col_to_description_mappings( self, list_of_user_address: list[UserAddress] - ) -> dict[str, list[str]]: - mappings: dict[str, list[str]] = {} + ) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} for user_address in list_of_user_address: for key, value in user_address.landlord_additional_info.items(): - mappings.setdefault(key, []).append(value) + mappings.setdefault(key, set()).add(value) return mappings diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py index 30e27164..e7ec7eab 100644 --- a/tests/infrastructure/test_csv_s3_client.py +++ b/tests/infrastructure/test_csv_s3_client.py @@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None: # act / assert with pytest.raises(ValueError, match="does not match client bucket"): csv_client.read_rows("s3://other-bucket/uploads/addresses.csv") + + +def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None: + # arrange: the Hyde export has two columns both headed "Walls" — a + # description and a score. Without disambiguation csv.DictReader would + # collapse them onto one key and the description would be lost. + raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n" + uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert: the first occurrence keeps its name, the second gets an index. + assert rows == [ + { + "Address 1": "1 High St", + "Walls": "Cavity: Filled", + "Roofs": "Pitched 300mm", + "Walls_1": "9.6", + } + ] + + +def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None: + # arrange: three columns sharing one header. + raw = "Walls,Walls,Walls\nfirst,second,third\n" + uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert + assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}] diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 5660bf78..4f241423 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -45,11 +45,26 @@ def test_collects_every_value_per_shared_key() -> None: # assert assert mappings == { - "description": ["cosy", "spacious", "bright"], - "condition": ["new", "worn", "fair"], + "description": {"cosy", "spacious", "bright"}, + "condition": {"new", "worn", "fair"}, } +def test_repeated_values_collapse_to_one_variant() -> None: + # arrange: two addresses share the same wall description. + addresses = [ + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "bright"}), + ] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: a set keeps one entry per distinct variant. + assert mappings == {"description": {"cosy", "bright"}} + + def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act mappings = _orchestrator().get_col_to_description_mappings([]) @@ -66,4 +81,4 @@ def test_single_address_yields_single_value_per_key() -> None: mappings = _orchestrator().get_col_to_description_mappings(addresses) # assert - assert mappings == {"description": ["cosy"]} + assert mappings == {"description": {"cosy"}}