bolstering testing

This commit is contained in:
Khalim Conn-Kowlessar 2026-04-28 13:46:09 +00:00
parent cadf8836d1
commit a1b207ba55
12 changed files with 201 additions and 161 deletions

View file

@ -59,3 +59,9 @@ New containers install all skills automatically via the Dockerfile. If you're in
bash .devcontainer/backend/install-claude-skills.sh bash .devcontainer/backend/install-claude-skills.sh
``` ```
## Type Safety
All new code must pass `pyright` with zero errors under `typeCheckingMode = strict`.
Annotate all function return types. Use `dict[str, Any]` for untyped external API
payloads — never bare `dict`. Add `pandas-stubs` when introducing pandas to a module.

View file

@ -14,3 +14,8 @@ openpyxl==3.1.5
# Basic # Basic
pytz pytz
sqlmodel sqlmodel
# HTTP client
httpx==0.28.1
# Data
pandas
pandas-stubs

View file

@ -1,3 +1,3 @@
from backend.epc_client.client import EpcClientService, EpcSearchResult from backend.epc_client.client import EpcClientService
__all__ = ["EpcClientService", "EpcSearchResult"] __all__ = ["EpcClientService"]

View file

@ -1,11 +1,9 @@
# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml # Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from typing import Any, Optional
from typing import Callable, Optional
import httpx import httpx
import pandas as pd
from backend.epc_client.exceptions import ( from backend.epc_client.exceptions import (
EpcApiError, EpcApiError,
@ -15,35 +13,11 @@ from backend.epc_client.exceptions import (
from backend.epc_client._retry import call_with_retry from backend.epc_client._retry import call_with_retry
from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from datatypes.epc.search import EpcSearchResult
@dataclass
class EpcSearchResult:
certificate_number: str
address_line_1: str
address_line_2: Optional[str]
address_line_3: Optional[str]
address_line_4: Optional[str]
postcode: str
post_town: str
uprn: Optional[int]
current_energy_efficiency_band: str
registration_date: str
@property
def full_address(self) -> str:
parts = [
self.address_line_1,
self.address_line_2,
self.address_line_3,
self.address_line_4,
]
return ", ".join(p for p in parts if p)
class EpcClientService: class EpcClientService:
BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk"
_MIN_MATCH_SCORE = 0.6
def __init__(self, auth_token: str) -> None: def __init__(self, auth_token: str) -> None:
self._headers = { self._headers = {
@ -65,34 +39,11 @@ class EpcClientService:
def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]:
return call_with_retry(lambda: self._search(postcode=postcode)) return call_with_retry(lambda: self._search(postcode=postcode))
def find_best_match(self, postcode: str, address: str) -> Optional[EpcPropertyData]:
from backend.utils.addressMatch import get_uprn_candidates
candidates = self.search_by_postcode(postcode)
if not candidates:
return None
# Round 1: score on addressLine1 only
cert_num = self._pick_best_cert(
candidates, address, use_full_address=False, fn=get_uprn_candidates
)
if cert_num:
return self._safe_get(cert_num)
# Round 2: score on all address lines joined
cert_num = self._pick_best_cert(
candidates, address, use_full_address=True, fn=get_uprn_candidates
)
if cert_num:
return self._safe_get(cert_num)
return None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Private helpers # Private helpers
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _fetch_certificate(self, cert_num: str) -> dict: def _fetch_certificate(self, cert_num: str) -> dict[str, Any]:
resp = httpx.get( resp = httpx.get(
f"{self.BASE_URL}/api/certificate", f"{self.BASE_URL}/api/certificate",
params={"certificate_number": cert_num}, params={"certificate_number": cert_num},
@ -133,7 +84,7 @@ class EpcClientService:
return [self._parse_search_result(r) for r in rows] return [self._parse_search_result(r) for r in rows]
@staticmethod @staticmethod
def _parse_search_result(row: dict) -> EpcSearchResult: def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult:
return EpcSearchResult( return EpcSearchResult(
certificate_number=row["certificateNumber"], certificate_number=row["certificateNumber"],
address_line_1=row["addressLine1"], address_line_1=row["addressLine1"],
@ -146,43 +97,3 @@ class EpcClientService:
current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], current_energy_efficiency_band=row["currentEnergyEfficiencyBand"],
registration_date=row["registrationDate"], registration_date=row["registrationDate"],
) )
def _pick_best_cert(
self,
candidates: list[EpcSearchResult],
user_address: str,
use_full_address: bool,
fn: Callable[..., pd.DataFrame],
) -> Optional[str]:
df = pd.DataFrame(
[
{
"address": (
r.full_address() if use_full_address else r.address_line_1
),
"uprn": str(r.uprn) if r.uprn is not None else "",
"certificate_number": r.certificate_number,
}
for r in candidates
]
)
scored = fn(df, user_address=user_address)
if scored.empty:
return None
best_score = scored.iloc[0]["lexiscore"]
if best_score < self._MIN_MATCH_SCORE:
return None
top = scored[scored["lexirank"] == 1]
if len(top) != 1:
return None
return str(top.iloc[0]["certificate_number"])
def _safe_get(self, cert_num: str) -> Optional[EpcPropertyData]:
try:
return self.get_by_certificate_number(cert_num)
except EpcNotFoundError:
return None

View file

@ -1 +0,0 @@
httpx==0.28.1

View file

@ -1,7 +1,9 @@
from unittest.mock import MagicMock, patch, call from unittest.mock import MagicMock, patch, call
import pytest import pytest
from backend.epc_client.client import EpcClientService, EpcSearchResult from backend.epc_client.client import EpcClientService
from backend.utils.epc_address_match import find_best_epc_match
from datatypes.epc.search import EpcSearchResult
from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError
from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.domain.epc_property_data import EpcPropertyData
from backend.epc_client.tests.conftest import make_search_row from backend.epc_client.tests.conftest import make_search_row
@ -122,88 +124,51 @@ def test_search_by_postcode_404_returns_empty_list(epc_service):
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Tests 8-10: find_best_match # Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _make_scored_df(rows, scores, ranks): def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert):
import pandas as pd
df = pd.DataFrame(rows)
df["lexiscore"] = scores
df["lexirank"] = ranks
return df.sort_values("lexirank")
def test_find_best_match_round1_clear_winner(epc_service, rdsap_21_0_1_cert):
search_rows = [ search_rows = [
make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"),
make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"), make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"),
] ]
cert_response = {"data": rdsap_21_0_1_cert} cert_response = {"data": rdsap_21_0_1_cert}
df_rows = [
{"address": "1 High Street", "uprn": "100023336956", "certificate_number": "CERT-WIN"},
{"address": "99 Nowhere Lane", "uprn": "100023336956", "certificate_number": "CERT-LOSE"},
]
scored = _make_scored_df(df_rows, [0.9, 0.1], [1, 2])
def fake_get(url, params=None, **kwargs): def fake_get(url, params=None, **kwargs):
if "search" in url: if "search" in url:
return _mock_response(200, {"data": search_rows}) return _mock_response(200, {"data": search_rows})
return _mock_response(200, cert_response) return _mock_response(200, cert_response)
with patch("httpx.get", side_effect=fake_get), \ with patch("httpx.get", side_effect=fake_get):
patch("backend.utils.addressMatch.get_uprn_candidates", return_value=scored): result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street")
result = epc_service.find_best_match("SW1A 1AA", "1 High Street")
assert isinstance(result, EpcPropertyData) assert isinstance(result, EpcPropertyData)
def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_0_1_cert): def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, rdsap_21_0_1_cert):
# Both candidates share address_line_1 — round 1 is ambiguous.
# Round 2 scores against full_address and picks the correct floor.
search_rows = [ search_rows = [
make_search_row( make_search_row(
cert_num="CERT-A", address_line_1="1 High Street", cert_num="CERT-A",
address_line_1="1 High Street",
address_line_2="Ground Floor", address_line_2="Ground Floor",
), ),
make_search_row( make_search_row(
cert_num="CERT-B", address_line_1="1 High Street", cert_num="CERT-B",
address_line_1="1 High Street",
address_line_2="First Floor", address_line_2="First Floor",
), ),
] ]
cert_response = {"data": rdsap_21_0_1_cert} cert_response = {"data": rdsap_21_0_1_cert}
# Round 1: both score equally — ambiguous (two rank-1s)
ambiguous = _make_scored_df(
[
{"address": "1 High Street", "uprn": "111", "certificate_number": "CERT-A"},
{"address": "1 High Street", "uprn": "222", "certificate_number": "CERT-B"},
],
[0.9, 0.9],
[1, 1],
)
# Round 2: CERT-A wins on full address
resolved = _make_scored_df(
[
{"address": "1 High Street, Ground Floor", "uprn": "111", "certificate_number": "CERT-A"},
{"address": "1 High Street, First Floor", "uprn": "222", "certificate_number": "CERT-B"},
],
[0.85, 0.4],
[1, 2],
)
call_count = {"n": 0}
def fake_candidates(df, user_address, **kwargs):
call_count["n"] += 1
return ambiguous if call_count["n"] == 1 else resolved
def fake_get(url, params=None, **kwargs): def fake_get(url, params=None, **kwargs):
if "search" in url: if "search" in url:
return _mock_response(200, {"data": search_rows}) return _mock_response(200, {"data": search_rows})
return _mock_response(200, cert_response) return _mock_response(200, cert_response)
with patch("httpx.get", side_effect=fake_get), \ with patch("httpx.get", side_effect=fake_get):
patch("backend.utils.addressMatch.get_uprn_candidates", side_effect=fake_candidates): result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street Ground Floor")
result = epc_service.find_best_match("SW1A 1AA", "1 High Street Ground Floor")
assert isinstance(result, EpcPropertyData) assert isinstance(result, EpcPropertyData)
@ -211,14 +176,7 @@ def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_
def test_find_best_match_returns_none_when_no_good_match(epc_service): def test_find_best_match_returns_none_when_no_good_match(epc_service):
search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")]
low_score = _make_scored_df( with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})):
[{"address": "99 Nowhere Lane", "uprn": "111", "certificate_number": "CERT-X"}], result = find_best_epc_match(epc_service, "SW1A 1AA", "1 Completely Different Road")
[0.1],
[1],
)
with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})), \
patch("backend.utils.addressMatch.get_uprn_candidates", return_value=low_score):
result = epc_service.find_best_match("SW1A 1AA", "1 Completely Different Road")
assert result is None assert result is None

View file

@ -0,0 +1,60 @@
from backend.utils.addressMatch import AddressMatch
class TestNormaliseAddress:
    """Expectations on the string produced by ``AddressMatch.normalise_address``."""

    def test_lowercases_input(self):
        """Upper-case input is expected back in lower case."""
        normalised = AddressMatch.normalise_address("1 HIGH STREET")
        assert normalised == "1 high street"

    def test_expands_road_abbreviation(self):
        """A trailing "Rd" is expected to come back as "road"."""
        normalised = AddressMatch.normalise_address("1 Moreton Rd")
        assert normalised == "1 moreton road"

    def test_expands_avenue_abbreviation(self):
        """A trailing "Ave" is expected to come back as "avenue"."""
        normalised = AddressMatch.normalise_address("2 Park Ave")
        assert normalised == "2 park avenue"

    def test_removes_punctuation_keeps_slash(self):
        """The comma must disappear while the slash in "1/A" survives."""
        normalised = AddressMatch.normalise_address("Flat 1/A, Some Road")
        assert "," not in normalised
        assert "/" in normalised

    def test_splits_digit_letter_suffix(self):
        """A house number with a letter suffix ("42a") is expected as "42 a"."""
        normalised = AddressMatch.normalise_address("42a Some Road")
        assert "42 a" in normalised

    def test_empty_string_returns_empty(self):
        """Empty input yields an empty string."""
        normalised = AddressMatch.normalise_address("")
        assert normalised == ""

    def test_removes_no_prefix(self):
        """A leading "No" token is dropped but the number it introduces is kept."""
        normalised = AddressMatch.normalise_address("No 5 High Street")
        tokens = normalised.split()
        assert "no" not in tokens
        # Substring check on the whole string, not on the token list.
        assert "5" in normalised
class TestScore:
    """Expectations on the similarity value returned by ``AddressMatch.score``."""

    def test_identical_address_scores_one(self):
        """Two identical strings score exactly 1.0."""
        result = AddressMatch.score("1 High Street", "1 High Street")
        assert result == 1.0

    def test_case_insensitive(self):
        """Case differences alone do not lower the score."""
        result = AddressMatch.score("1 HIGH STREET", "1 high street")
        assert result == 1.0

    def test_street_type_synonym_scores_one(self):
        """An abbreviated street type scores the same as its long form."""
        # "Rd" expands to "road" during normalisation — should be identical
        result = AddressMatch.score("1 High Rd", "1 High Road")
        assert result == 1.0

    def test_different_building_numbers_score_zero(self):
        """Same street, different building number: expected score is 0.0."""
        result = AddressMatch.score("1 High Street", "2 High Street")
        assert result == 0.0

    def test_disjoint_number_sets_score_zero(self):
        """Entirely different addresses with different numbers score 0.0."""
        result = AddressMatch.score("1 High Street", "99 Nowhere Lane")
        assert result == 0.0

    def test_user_address_has_number_but_epc_does_not_scores_zero(self):
        """A number on the user side with none on the EPC side scores 0.0."""
        result = AddressMatch.score("1 High Street", "High Street")
        assert result == 0.0

    def test_partial_address_scores_above_threshold(self):
        """An extra trailing token lowers the score but keeps it at or above 0.6."""
        # Extra token in user address ("London") — same building number, high overlap
        result = AddressMatch.score("1 High Street London", "1 High Street")
        assert 0.6 <= result < 1.0

    def test_flat_number_mismatch_scores_zero(self):
        """Mismatching flat numbers on the same building score 0.0."""
        # User has two numbers but no "flat" token; EPC has different flat number
        # Triggers the order-sensitive flat guard
        result = AddressMatch.score("3 42 High Street", "Flat 7 42 High Street")
        assert result == 0.0

View file

@ -1,8 +1,13 @@
from __future__ import annotations
import re import re
from typing import Any, Optional
from difflib import SequenceMatcher from difflib import SequenceMatcher
from typing import TYPE_CHECKING, Any, Optional
import requests import requests
import pandas as pd
if TYPE_CHECKING:
import pandas as pd
class AddressMatch: class AddressMatch:

View file

@ -0,0 +1,67 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Optional
from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.search import EpcSearchResult
if TYPE_CHECKING:
from backend.epc_client.client import EpcClientService
_MIN_MATCH_SCORE = 0.6
def find_best_epc_match(
    service: EpcClientService,
    postcode: str,
    address: str,
) -> Optional[EpcPropertyData]:
    """Fetch the EPC record in ``postcode`` whose address best matches ``address``.

    Candidates come from ``service.search_by_postcode``. They are scored in at
    most two passes: first against each candidate's ``address_line_1`` only,
    then against its ``full_address``. The first pass that produces a
    certificate number decides the outcome; the second pass is not attempted
    after that, even if fetching the certificate yields ``None``.

    Args:
        service: Client used for the postcode search and the certificate fetch.
        postcode: Postcode to search.
        address: User-supplied address to score the candidates against.

    Returns:
        The fetched property data, or ``None`` when the search is empty, no
        pass selects a certificate, or the selected certificate is not found.
    """
    candidates = service.search_by_postcode(postcode)
    if not candidates:
        return None

    # Pass 1 scores on address_line_1 only; pass 2 on the joined address lines.
    for use_full_address in (False, True):
        cert_num = _pick_best_cert(
            candidates, address, use_full_address=use_full_address
        )
        if cert_num:
            return _safe_get(service, cert_num)

    return None
def _pick_best_cert(
    candidates: list[EpcSearchResult],
    user_address: str,
    use_full_address: bool,
) -> Optional[str]:
    """Return the certificate number of the single best-scoring candidate.

    Each candidate is scored with ``AddressMatch.score`` against either its
    ``full_address`` or its ``address_line_1``, depending on
    ``use_full_address``.

    Returns:
        The winning certificate number, or ``None`` when there are no
        candidates, the best score is below ``_MIN_MATCH_SCORE``, or the best
        score is not held by exactly one candidate.
    """
    scored: list[tuple[float, str]] = []
    for candidate in candidates:
        if use_full_address:
            epc_address = candidate.full_address
        else:
            epc_address = candidate.address_line_1
        match_score = AddressMatch.score(user_address, epc_address)
        scored.append((match_score, candidate.certificate_number))

    if not scored:
        return None

    top_score = max(match_score for match_score, _ in scored)
    if top_score < _MIN_MATCH_SCORE:
        return None

    # A tie for the top score is treated as ambiguous rather than guessed at.
    winners = [cert for match_score, cert in scored if match_score == top_score]
    return winners[0] if len(winners) == 1 else None
def _safe_get(service: EpcClientService, cert_num: str) -> Optional[EpcPropertyData]:
    """Fetch a certificate by number, mapping ``EpcNotFoundError`` to ``None``.

    Any other exception raised by ``service.get_by_certificate_number``
    propagates to the caller unchanged.
    """
    # Imported at call time, as in the original; the reason (possibly an import
    # cycle with the client package) is not visible here — TODO confirm.
    from backend.epc_client.exceptions import EpcNotFoundError

    try:
        property_data = service.get_by_certificate_number(cert_num)
    except EpcNotFoundError:
        return None
    return property_data

View file

@ -0,0 +1,3 @@
from datatypes.epc.search.epc_search_result import EpcSearchResult
__all__ = ["EpcSearchResult"]

View file

@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional
@dataclass
class EpcSearchResult:
    """A single EPC search hit: certificate number plus address and rating fields."""

    certificate_number: str
    # Address lines 2-4 are optional and may be None.
    address_line_1: str
    address_line_2: Optional[str]
    address_line_3: Optional[str]
    address_line_4: Optional[str]
    postcode: str
    post_town: str
    uprn: Optional[int]
    current_energy_efficiency_band: str
    # Held as a string; no date parsing happens in this class.
    registration_date: str

    @property
    def full_address(self) -> str:
        """Return the populated address lines joined with ``", "``.

        Lines that are ``None`` or empty are left out, so no stray separators
        appear in the result.
        """
        address_lines = (
            self.address_line_1,
            self.address_line_2,
            self.address_line_3,
            self.address_line_4,
        )
        return ", ".join(filter(None, address_lines))

View file

@ -1,3 +1 @@
[tool.pyright] [tool.pyright]
reportUnknownMemberType = false
reportUnknownVariableType = false