Create lodgement dates data, using the old EPC API, to verify test failures

2026-07-27 23:35:01 +00:00 · 2026-04-27 12:15:30 +00:00 · 2026-04-27 12:15:30 +00:00 · 1af6bc6748
commit 1af6bc6748
parent 0955862973
3 changed files with 1348 additions and 9 deletions
--- a/backend/address2UPRN/tests/populate_lodgement_dates.py
+++ b/backend/address2UPRN/tests/populate_lodgement_dates.py
@ -0,0 +1,81 @@
+import csv
+import json
+import os
+from pathlib import Path
+from urllib.parse import urlencode
+
+import pandas as pd
+from epc_api.client import EpcClient
+
+FIXTURE_PATH = Path(__file__).parent / "test_data.csv"
+SIDECAR_PATH = Path(__file__).parent / "test_lodgement_dates.json"
+
+
+def fetch_postcode_records(client: EpcClient, postcode: str) -> pd.DataFrame:
+    url = os.path.join(client.domestic.host, "search")
+    url += "?" + urlencode({"size": 500})
+    resp = client.domestic.call(url=url, method="get", params={"postcode": postcode})
+    if not resp or "rows" not in resp:
+        return pd.DataFrame()
+    return pd.DataFrame(resp["rows"], columns=resp["column-names"])
+
+
+def main():
+    auth_token = os.getenv("EPC_AUTH_TOKEN")
+    if not auth_token:
+        raise RuntimeError("EPC_AUTH_TOKEN not set")
+
+    client = EpcClient(auth_token=auth_token)
+
+    sidecar = {}
+    if SIDECAR_PATH.exists():
+        sidecar = json.loads(SIDECAR_PATH.read_text())
+
+    with open(FIXTURE_PATH, newline="", encoding="utf-8") as f:
+        rows = list(csv.DictReader(f))
+
+    by_postcode: dict[str, list[dict]] = {}
+    for row in rows:
+        if row["Manual UPRN Code"] == "None":
+            continue
+        by_postcode.setdefault(row["Postcode"], []).append(row)
+
+    for postcode, postcode_rows in by_postcode.items():
+        print(f"Fetching {postcode} ({len(postcode_rows)} rows)...")
+        try:
+            epc_df = fetch_postcode_records(client, postcode)
+        except Exception as e:
+            print(f"  ERROR: {e}")
+            continue
+
+        if epc_df.empty:
+            print(f"  No results from old API for {postcode}")
+            continue
+
+        epc_df["uprn"] = epc_df["uprn"].astype(str).str.replace(r"\.0$", "", regex=True)
+
+        for row in postcode_rows:
+            key = f"{row['User Input']}|{row['Postcode']}"
+            if key in sidecar:
+                continue
+
+            expected_uprn = str(row["Manual UPRN Code"]).strip()
+            match = epc_df[epc_df["uprn"] == expected_uprn]
+
+            if match.empty:
+                print(f"  WARN: UPRN {expected_uprn} not found in old API for {postcode}")
+                sidecar[key] = {"lodgement_date": None, "found_in_old_api": False}
+            else:
+                lodgement_date = match.iloc[0].get("lodgement-date")
+                sidecar[key] = {
+                    "lodgement_date": str(lodgement_date) if lodgement_date else None,
+                    "found_in_old_api": True,
+                }
+                print(f"  {row['User Input']}: {lodgement_date}")
+
+    SIDECAR_PATH.write_text(json.dumps(sidecar, indent=2))
+    print(f"\nWritten to {SIDECAR_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/backend/address2UPRN/tests/test_csv.py
+++ b/backend/address2UPRN/tests/test_csv.py
@ -1,25 +1,54 @@
 # tests/test_address_to_uprn_csv.py

 import csv
+import json
 import pytest
+from datetime import date
 from pathlib import Path
 from backend.address2UPRN.main import get_uprn

 FIXTURE_PATH = Path(__file__).parent / "test_data.csv"
+SIDECAR_PATH = Path(__file__).parent / "test_lodgement_dates.json"
+NEW_API_CUTOFF = date(2012, 1, 1)
+
+
+def _load_sidecar() -> dict:
+    if SIDECAR_PATH.exists():
+        return json.loads(SIDECAR_PATH.read_text())
+    return {}


 def load_test_cases():
+    sidecar = _load_sidecar()
    with open(FIXTURE_PATH, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
-        return [
-            pytest.param(
-                row["User Input"],
-                row["Postcode"],
-                row["Manual UPRN Code"],
-                id=f'{row["User Input"]} [{row["Postcode"]}]',
+        cases = []
+        for row in reader:
+            key = f"{row['User Input']}|{row['Postcode']}"
+            entry = sidecar.get(key, {})
+            lodgement_date = entry.get("lodgement_date")
+
+            marks = []
+            if lodgement_date:
+                parsed = date.fromisoformat(lodgement_date[:10])
+                if parsed < NEW_API_CUTOFF:
+                    marks.append(
+                        pytest.mark.xfail(
+                            reason=f"EPC lodged {lodgement_date} — predates new API coverage (Jan 2012)",
+                            strict=False,
+                        )
+                    )
+
+            cases.append(
+                pytest.param(
+                    row["User Input"],
+                    row["Postcode"],
+                    row["Manual UPRN Code"],
+                    id=f'{row["User Input"]} [{row["Postcode"]}]',
+                    marks=marks,
+                )
            )
-            for row in reader
-        ]
+        return cases


@pytest.mark.parametrize(
@ -31,7 +60,6 @@ def test_uprn_resolution_matches_manual(
    postcode: str,
    expected_uprn: str,
 ):
-
    uprn = get_uprn(user_input, postcode)
    if uprn:
        assert uprn == expected_uprn
--- a/backend/address2UPRN/tests/test_lodgement_dates.json
+++ b/backend/address2UPRN/tests/test_lodgement_dates.json