Move _cell_to_str into utils as pandas_cell_to_str

This commit is contained in:
Jun-te Kim 2026-05-11 08:37:44 +00:00
parent 7ef5dc4922
commit fb758b76bf
4 changed files with 18 additions and 16 deletions

View file

@ -1,5 +1,5 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Optional from typing import Optional
import pandas as pd import pandas as pd
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
@ -7,6 +7,7 @@ from botocore.exceptions import ClientError
from backend.address2UPRN.scoring import get_uprn_candidates from backend.address2UPRN.scoring import get_uprn_candidates
from backend.utils.addressMatch import AddressMatch from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.historic_epc import HistoricEpc from datatypes.epc.domain.historic_epc import HistoricEpc
from utils.pandas_utils import pandas_cell_to_str
from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 from utils.s3 import parse_s3_uri, read_csv_gz_from_s3
DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
_EXTRA_COLS = {"lexiscore", "lexirank"} _EXTRA_COLS = {"lexiscore", "lexirank"}
def _cell_to_str(v: Any) -> str:
if v is None or (isinstance(v, float) and pd.isna(v)):
return ""
s = str(v).replace("\xa0", " ")
# get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan".
# Treat that as missing so unambiguous_uprn truthiness checks work.
if s.lower() == "nan":
return ""
return s
def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: def _row_to_historic_epc(row: pd.Series) -> HistoricEpc:
kwargs = { kwargs = {
col.lower(): _cell_to_str(val) col.lower(): pandas_cell_to_str(val)
for col, val in row.items() for col, val in row.items()
if col.lower() not in _EXTRA_COLS if col.lower() not in _EXTRA_COLS
} }

View file

@ -211,7 +211,7 @@ class TestUnambiguousUprn:
]) ])
result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL")
top = result.top() top = result.top()
# _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"),
# so unambiguous_uprn's truthiness check correctly drops the row. # so unambiguous_uprn's truthiness check correctly drops the row.
assert top.record.uprn == "" assert top.record.uprn == ""

14
utils/pandas_utils.py Normal file
View file

@ -0,0 +1,14 @@
from typing import Any
import pandas as pd
def pandas_cell_to_str(v: Any) -> str:
    """Convert a single pandas cell value to a clean string.

    Missing values map to the empty string so callers can rely on plain
    truthiness checks. Non-breaking spaces are replaced with regular spaces.

    Args:
        v: The raw cell value taken from a row.

    Returns:
        ``""`` for ``None``, NaN, ``pd.NA``, ``pd.NaT`` or the literal text
        ``"nan"`` (any casing); otherwise ``str(v)`` with ``"\\xa0"``
        replaced by ``" "``.
    """
    # pd.isna returns an array for list-likes, so only ask it about scalars.
    # Checking every scalar (not just floats) also catches pd.NA and pd.NaT,
    # which would otherwise stringify to the truthy "<NA>" / "NaT".
    if v is None or (pd.api.types.is_scalar(v) and pd.isna(v)):
        return ""
    s = str(v).replace("\xa0", " ")
    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan".
    # Treat that as missing so unambiguous_uprn truthiness checks work.
    if s.lower() == "nan":
        return ""
    return s

View file

@ -6,8 +6,6 @@ from io import BytesIO, StringIO
from urllib.parse import unquote from urllib.parse import unquote
from utils.logger import setup_logger from utils.logger import setup_logger
from botocore.exceptions import NoCredentialsError, PartialCredentialsError from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from typing import Any
logger = setup_logger() logger = setup_logger()