Move _cell_to_str into utils/pandas_utils.py as pandas_cell_to_str

This commit is contained in:
Jun-te Kim 2026-05-11 08:37:44 +00:00
parent 7ef5dc4922
commit fb758b76bf
4 changed files with 18 additions and 16 deletions

View file

@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Any, Optional
from typing import Optional
import pandas as pd
from botocore.exceptions import ClientError
@ -7,6 +7,7 @@ from botocore.exceptions import ClientError
from backend.address2UPRN.scoring import get_uprn_candidates
from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.historic_epc import HistoricEpc
from utils.pandas_utils import pandas_cell_to_str
from utils.s3 import parse_s3_uri, read_csv_gz_from_s3
DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
_EXTRA_COLS = {"lexiscore", "lexirank"}
def _cell_to_str(v: Any) -> str:
    """Convert a single cell value to a clean string.

    Missing values are mapped to the empty string so that callers can rely
    on plain truthiness checks, and non-breaking spaces (U+00A0) are
    replaced with regular spaces.

    Args:
        v: A cell value, e.g. one yielded by ``Series.items()``.

    Returns:
        ``""`` when ``v`` is ``None``, NaN, ``pd.NA``, ``pd.NaT`` or the
        text "nan" in any letter case; otherwise ``str(v)`` with
        non-breaking spaces normalised.
    """
    # pd.isna covers None, float/numpy NaN, pd.NA and pd.NaT. It returns an
    # array for list-likes, so only call it on scalars.
    if v is None or (pd.api.types.is_scalar(v) and pd.isna(v)):
        return ""
    s = str(v).replace("\xa0", " ")
    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan".
    # Treat that as missing so unambiguous_uprn truthiness checks work.
    if s.lower() == "nan":
        return ""
    return s
def _row_to_historic_epc(row: pd.Series) -> HistoricEpc:
kwargs = {
col.lower(): _cell_to_str(val)
col.lower(): pandas_cell_to_str(val)
for col, val in row.items()
if col.lower() not in _EXTRA_COLS
}

View file

@ -211,7 +211,7 @@ class TestUnambiguousUprn:
])
result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL")
top = result.top()
# _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"),
# pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"),
# so unambiguous_uprn's truthiness check correctly drops the row.
assert top.record.uprn == ""

14
utils/pandas_utils.py Normal file
View file

@ -0,0 +1,14 @@
from typing import Any
import pandas as pd
def pandas_cell_to_str(v: Any) -> str:
    """Convert a single pandas cell value to a clean string.

    Missing values are mapped to the empty string so that callers can rely
    on plain truthiness checks, and non-breaking spaces (U+00A0) are
    replaced with regular spaces.

    Args:
        v: A cell value, e.g. one yielded by ``Series.items()``.

    Returns:
        ``""`` when ``v`` is ``None``, NaN, ``pd.NA``, ``pd.NaT`` or the
        text "nan" in any letter case; otherwise ``str(v)`` with
        non-breaking spaces normalised.
    """
    # pd.isna covers None, float/numpy NaN, pd.NA and pd.NaT. It returns an
    # array for list-likes, so only call it on scalars.
    if v is None or (pd.api.types.is_scalar(v) and pd.isna(v)):
        return ""
    s = str(v).replace("\xa0", " ")
    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan".
    # Treat that as missing so unambiguous_uprn truthiness checks work.
    if s.lower() == "nan":
        return ""
    return s

View file

@ -6,8 +6,6 @@ from io import BytesIO, StringIO
from urllib.parse import unquote
from utils.logger import setup_logger
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from typing import Any
logger = setup_logger()