From fb758b76bf2dcecbed486b569b9fa5e345a85ddc Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 11 May 2026 08:37:44 +0000
Subject: [PATCH] Move _cell_to_str to utils/pandas_utils.py as pandas_cell_to_str

---
 datatypes/epc/domain/historic_epc_matching.py    | 16 +++-------------
 .../domain/tests/test_historic_epc_matching.py   |  2 +-
 utils/pandas_utils.py                            | 14 ++++++++++++++
 utils/s3.py                                      |  2 --
 4 files changed, 18 insertions(+), 16 deletions(-)
 create mode 100644 utils/pandas_utils.py

diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py
index 53f602ae..2eb590e8 100644
--- a/datatypes/epc/domain/historic_epc_matching.py
+++ b/datatypes/epc/domain/historic_epc_matching.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Optional
 
 import pandas as pd
 from botocore.exceptions import ClientError
@@ -7,6 +7,7 @@ from botocore.exceptions import ClientError
 from backend.address2UPRN.scoring import get_uprn_candidates
 from backend.utils.addressMatch import AddressMatch
 from datatypes.epc.domain.historic_epc import HistoricEpc
+from utils.pandas_utils import pandas_cell_to_str
 from utils.s3 import parse_s3_uri, read_csv_gz_from_s3
 
 DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
@@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc"
 _EXTRA_COLS = {"lexiscore", "lexirank"}
 
 
-def _cell_to_str(v: Any) -> str:
-    if v is None or (isinstance(v, float) and pd.isna(v)):
-        return ""
-    s = str(v).replace("\xa0", " ")
-    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan".
-    # Treat that as missing so unambiguous_uprn truthiness checks work.
-    if s.lower() == "nan":
-        return ""
-    return s
-
-
 def _row_to_historic_epc(row: pd.Series) -> HistoricEpc:
     kwargs = {
-        col.lower(): _cell_to_str(val)
+        col.lower(): pandas_cell_to_str(val)
         for col, val in row.items()
         if col.lower() not in _EXTRA_COLS
     }
diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py
index c23846e1..1c3ee6d4 100644
--- a/datatypes/epc/domain/tests/test_historic_epc_matching.py
+++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py
@@ -211,7 +211,7 @@ class TestUnambiguousUprn:
         ])
         result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL")
         top = result.top()
-        # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"),
+        # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"),
         # so unambiguous_uprn's truthiness check correctly drops the row.
         assert top.record.uprn == ""
 
diff --git a/utils/pandas_utils.py b/utils/pandas_utils.py
new file mode 100644
index 00000000..b32cde10
--- /dev/null
+++ b/utils/pandas_utils.py
@@ -0,0 +1,14 @@
+from typing import Any
+
+import pandas as pd
+
+
+def pandas_cell_to_str(v: Any) -> str:
+    """Return str(v) with NBSP as space; None, NaN/NA/NaT and "nan" give ''."""
+    # Only scalars are passed to pd.isna; for array-likes it returns an array.
+    if v is None or (pd.api.types.is_scalar(v) and pd.isna(v)):
+        return ""
+    s = str(v).replace("\xa0", " ")
+    # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan";
+    # treat that as missing so unambiguous_uprn truthiness checks work.
+    return "" if s.lower() == "nan" else s
diff --git a/utils/s3.py b/utils/s3.py
index a28f074e..13d272e7 100644
--- a/utils/s3.py
+++ b/utils/s3.py
@@ -6,8 +6,6 @@ from io import BytesIO, StringIO
 from urllib.parse import unquote
 from utils.logger import setup_logger
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
-from typing import Any
-
 logger = setup_logger()