save correct progress

This commit is contained in:
Jun-te Kim 2026-05-21 16:57:14 +00:00
parent c833a3c91b
commit 8baa4c82aa
5 changed files with 91 additions and 11 deletions

View file

@ -7,6 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
)
from domain.addresses.user_address import UserAddress
def handler(
@ -31,11 +32,13 @@ def handler(
user_address_repo=user_address_repo,
)
list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri)
list_of_user_address: list[UserAddress] = orchestrator.get_user_address(
input_s3_uri=s3_uri
)
for each_user_address in list_of_user_address:
print(each_user_address.landlord_additional_info.keys())
break
col_to_desc_map = orchestrator.get_col_to_description_mappings(
list_of_user_address=list_of_user_address
)
# Read csv of user input
# get the column and unique variations of each description

View file

@ -5,6 +5,30 @@ from infrastructure.s3_client import S3Client
from infrastructure.s3_uri import parse_s3_uri
def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]:
"""Disambiguate repeated CSV headers by appending an index.
The first occurrence keeps its name; each later one becomes
``name_1``, ``name_2`` so duplicate columns survive as distinct
keys instead of collapsing onto one (last-wins) dict entry.
"""
deduped: list[str] = []
counts: dict[str, int] = {}
for name in fieldnames:
if name not in counts:
counts[name] = 0
deduped.append(name)
continue
counts[name] += 1
candidate = f"{name}_{counts[name]}"
while candidate in counts:
counts[name] += 1
candidate = f"{name}_{counts[name]}"
counts[candidate] = 0
deduped.append(candidate)
return deduped
class CsvS3Client(S3Client):
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
bucket, key = parse_s3_uri(s3_uri)
@ -19,7 +43,12 @@ class CsvS3Client(S3Client):
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
text = raw.decode("cp1252")
reader = csv.DictReader(StringIO(text))
buffer = StringIO(text)
header = next(csv.reader(buffer), None)
if header is None:
return []
fieldnames = _dedupe_fieldnames(header)
reader = csv.DictReader(buffer, fieldnames=fieldnames)
return [dict(row) for row in reader]
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:

View file

@ -14,9 +14,9 @@ class LandlordDescriptionOverridesOrchestrator:
def get_col_to_description_mappings(
self, list_of_user_address: list[UserAddress]
) -> dict[str, list[str]]:
mappings: dict[str, list[str]] = {}
) -> dict[str, set[str]]:
mappings: dict[str, set[str]] = {}
for user_address in list_of_user_address:
for key, value in user_address.landlord_additional_info.items():
mappings.setdefault(key, []).append(value)
mappings.setdefault(key, set()).add(value)
return mappings

View file

@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
# act / assert
with pytest.raises(ValueError, match="does not match client bucket"):
csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
# arrange: the Hyde export has two columns both headed "Walls" — a
# description and a score. Without disambiguation csv.DictReader would
# collapse them onto one key and the description would be lost.
raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8"))
# act
rows = csv_client.read_rows(uri)
# assert: the first occurrence keeps its name, the second gets an index.
assert rows == [
{
"Address 1": "1 High St",
"Walls": "Cavity: Filled",
"Roofs": "Pitched 300mm",
"Walls_1": "9.6",
}
]
def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
# arrange: three columns sharing one header.
raw = "Walls,Walls,Walls\nfirst,second,third\n"
uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8"))
# act
rows = csv_client.read_rows(uri)
# assert
assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]

View file

@ -45,11 +45,26 @@ def test_collects_every_value_per_shared_key() -> None:
# assert
assert mappings == {
"description": ["cosy", "spacious", "bright"],
"condition": ["new", "worn", "fair"],
"description": {"cosy", "spacious", "bright"},
"condition": {"new", "worn", "fair"},
}
def test_repeated_values_collapse_to_one_variant() -> None:
# arrange: two addresses share the same wall description.
addresses = [
_make_user_address({"description": "cosy"}),
_make_user_address({"description": "cosy"}),
_make_user_address({"description": "bright"}),
]
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert: a set keeps one entry per distinct variant.
assert mappings == {"description": {"cosy", "bright"}}
def test_empty_address_list_yields_empty_mapping() -> None:
# arrange / act
mappings = _orchestrator().get_col_to_description_mappings([])
@ -66,4 +81,4 @@ def test_single_address_yields_single_value_per_key() -> None:
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert
assert mappings == {"description": ["cosy"]}
assert mappings == {"description": {"cosy"}}