From fb758b76bf2dcecbed486b569b9fa5e345a85ddc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:37:44 +0000 Subject: [PATCH] changed to utils --- datatypes/epc/domain/historic_epc_matching.py | 16 +++------------- .../domain/tests/test_historic_epc_matching.py | 2 +- utils/pandas_utils.py | 14 ++++++++++++++ utils/s3.py | 2 -- 4 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 utils/pandas_utils.py diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 53f602ae..2eb590e8 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import pandas as pd from botocore.exceptions import ClientError @@ -7,6 +7,7 @@ from botocore.exceptions import ClientError from backend.address2UPRN.scoring import get_uprn_candidates from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.pandas_utils import pandas_cell_to_str from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" @@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _cell_to_str(v: Any) -> str: - if v is None or (isinstance(v, float) and pd.isna(v)): - return "" - s = str(v).replace("\xa0", " ") - # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". - # Treat that as missing so unambiguous_uprn truthiness checks work. 
def pandas_cell_to_str(v: Any) -> str:
    """Convert a single pandas cell value to a clean string.

    Missing values are returned as the empty string so that callers can
    rely on plain truthiness checks (for example the ``unambiguous_uprn``
    check on a record's UPRN).

    Treated as missing:

    * ``None`` and any scalar for which ``pd.isna`` is true (float NaN,
      ``pd.NA``, ``pd.NaT``, NumPy NaN/NaT scalars);
    * the text ``"nan"`` in any casing, and the text ``"<NA>"`` -- these are
      what ``.astype(str)`` leaves behind for NaN and ``pd.NA`` cells.

    Non-breaking spaces (U+00A0) in the value are replaced by ordinary
    spaces.

    Args:
        v: A single cell value, e.g. an item yielded by ``Series.items()``.

    Returns:
        The string form of ``v``, or ``""`` when ``v`` represents a missing
        value.
    """
    # Guard with is_scalar: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous, so non-scalars fall through to str() below.
    if pd.api.types.is_scalar(v) and pd.isna(v):
        return ""

    s = str(v).replace("\xa0", " ")

    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan"
    # (and pd.NA into "<NA>" for nullable dtypes). Treat those as missing so
    # unambiguous_uprn truthiness checks work.
    if s.lower() == "nan" or s == "<NA>":
        return ""
    return s