mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
save correct progress
This commit is contained in:
parent
c833a3c91b
commit
8baa4c82aa
5 changed files with 91 additions and 11 deletions
|
|
@ -7,6 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client
|
|||
from repositories.user_address.user_address_csv_s3_repository import (
|
||||
UserAddressCsvS3Repository,
|
||||
)
|
||||
from domain.addresses.user_address import UserAddress
|
||||
|
||||
|
||||
def handler(
|
||||
|
|
@ -31,11 +32,13 @@ def handler(
|
|||
user_address_repo=user_address_repo,
|
||||
)
|
||||
|
||||
list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri)
|
||||
list_of_user_address: list[UserAddress] = orchestrator.get_user_address(
|
||||
input_s3_uri=s3_uri
|
||||
)
|
||||
|
||||
for each_user_address in list_of_user_address:
|
||||
print(each_user_address.landlord_additional_info.keys())
|
||||
break
|
||||
col_to_desc_map = orchestrator.get_col_to_description_mappings(
|
||||
list_of_user_address=list_of_user_address
|
||||
)
|
||||
|
||||
# Read csv of user input
|
||||
# get the column and unique variations of each description
|
||||
|
|
|
|||
|
|
@ -5,6 +5,30 @@ from infrastructure.s3_client import S3Client
|
|||
from infrastructure.s3_uri import parse_s3_uri
|
||||
|
||||
|
||||
def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]:
|
||||
"""Disambiguate repeated CSV headers by appending an index.
|
||||
|
||||
The first occurrence keeps its name; each later one becomes
|
||||
``name_1``, ``name_2`` … so duplicate columns survive as distinct
|
||||
keys instead of collapsing onto one (last-wins) dict entry.
|
||||
"""
|
||||
deduped: list[str] = []
|
||||
counts: dict[str, int] = {}
|
||||
for name in fieldnames:
|
||||
if name not in counts:
|
||||
counts[name] = 0
|
||||
deduped.append(name)
|
||||
continue
|
||||
counts[name] += 1
|
||||
candidate = f"{name}_{counts[name]}"
|
||||
while candidate in counts:
|
||||
counts[name] += 1
|
||||
candidate = f"{name}_{counts[name]}"
|
||||
counts[candidate] = 0
|
||||
deduped.append(candidate)
|
||||
return deduped
|
||||
|
||||
|
||||
class CsvS3Client(S3Client):
|
||||
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
|
||||
bucket, key = parse_s3_uri(s3_uri)
|
||||
|
|
@ -19,7 +43,12 @@ class CsvS3Client(S3Client):
|
|||
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
|
||||
text = raw.decode("cp1252")
|
||||
|
||||
reader = csv.DictReader(StringIO(text))
|
||||
buffer = StringIO(text)
|
||||
header = next(csv.reader(buffer), None)
|
||||
if header is None:
|
||||
return []
|
||||
fieldnames = _dedupe_fieldnames(header)
|
||||
reader = csv.DictReader(buffer, fieldnames=fieldnames)
|
||||
return [dict(row) for row in reader]
|
||||
|
||||
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
|
||||
|
|
|
|||
|
|
@ -14,9 +14,9 @@ class LandlordDescriptionOverridesOrchestrator:
|
|||
|
||||
def get_col_to_description_mappings(
|
||||
self, list_of_user_address: list[UserAddress]
|
||||
) -> dict[str, list[str]]:
|
||||
mappings: dict[str, list[str]] = {}
|
||||
) -> dict[str, set[str]]:
|
||||
mappings: dict[str, set[str]] = {}
|
||||
for user_address in list_of_user_address:
|
||||
for key, value in user_address.landlord_additional_info.items():
|
||||
mappings.setdefault(key, []).append(value)
|
||||
mappings.setdefault(key, set()).add(value)
|
||||
return mappings
|
||||
|
|
|
|||
|
|
@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
|
|||
# act / assert
|
||||
with pytest.raises(ValueError, match="does not match client bucket"):
|
||||
csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
|
||||
|
||||
|
||||
def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
|
||||
# arrange: the Hyde export has two columns both headed "Walls" — a
|
||||
# description and a score. Without disambiguation csv.DictReader would
|
||||
# collapse them onto one key and the description would be lost.
|
||||
raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
|
||||
uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8"))
|
||||
|
||||
# act
|
||||
rows = csv_client.read_rows(uri)
|
||||
|
||||
# assert: the first occurrence keeps its name, the second gets an index.
|
||||
assert rows == [
|
||||
{
|
||||
"Address 1": "1 High St",
|
||||
"Walls": "Cavity: Filled",
|
||||
"Roofs": "Pitched 300mm",
|
||||
"Walls_1": "9.6",
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
|
||||
# arrange: three columns sharing one header.
|
||||
raw = "Walls,Walls,Walls\nfirst,second,third\n"
|
||||
uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8"))
|
||||
|
||||
# act
|
||||
rows = csv_client.read_rows(uri)
|
||||
|
||||
# assert
|
||||
assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]
|
||||
|
|
|
|||
|
|
@ -45,11 +45,26 @@ def test_collects_every_value_per_shared_key() -> None:
|
|||
|
||||
# assert
|
||||
assert mappings == {
|
||||
"description": ["cosy", "spacious", "bright"],
|
||||
"condition": ["new", "worn", "fair"],
|
||||
"description": {"cosy", "spacious", "bright"},
|
||||
"condition": {"new", "worn", "fair"},
|
||||
}
|
||||
|
||||
|
||||
def test_repeated_values_collapse_to_one_variant() -> None:
|
||||
# arrange: two addresses share the same wall description.
|
||||
addresses = [
|
||||
_make_user_address({"description": "cosy"}),
|
||||
_make_user_address({"description": "cosy"}),
|
||||
_make_user_address({"description": "bright"}),
|
||||
]
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert: a set keeps one entry per distinct variant.
|
||||
assert mappings == {"description": {"cosy", "bright"}}
|
||||
|
||||
|
||||
def test_empty_address_list_yields_empty_mapping() -> None:
|
||||
# arrange / act
|
||||
mappings = _orchestrator().get_col_to_description_mappings([])
|
||||
|
|
@ -66,4 +81,4 @@ def test_single_address_yields_single_value_per_key() -> None:
|
|||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert
|
||||
assert mappings == {"description": ["cosy"]}
|
||||
assert mappings == {"description": {"cosy"}}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue