mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
bolstering testing
This commit is contained in:
parent
cadf8836d1
commit
a1b207ba55
12 changed files with 201 additions and 161 deletions
|
|
@ -59,3 +59,9 @@ New containers install all skills automatically via the Dockerfile. If you're in
|
|||
bash .devcontainer/backend/install-claude-skills.sh
|
||||
```
|
||||
|
||||
## Type Safety
|
||||
|
||||
All new code must pass `pyright` with zero errors under `typeCheckingMode = strict`.
|
||||
Annotate all function return types. Use `dict[str, Any]` for untyped external API
|
||||
payloads — never bare `dict`. Add `pandas-stubs` when introducing pandas to a module.
|
||||
|
||||
|
|
|
|||
|
|
@ -14,3 +14,8 @@ openpyxl==3.1.5
|
|||
# Basic
|
||||
pytz
|
||||
sqlmodel
|
||||
# HTTP client
|
||||
httpx==0.28.1
|
||||
# Data
|
||||
pandas
|
||||
pandas-stubs
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
from backend.epc_client.client import EpcClientService, EpcSearchResult
|
||||
from backend.epc_client.client import EpcClientService
|
||||
|
||||
__all__ = ["EpcClientService", "EpcSearchResult"]
|
||||
__all__ = ["EpcClientService"]
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
import pandas as pd
|
||||
|
||||
from backend.epc_client.exceptions import (
|
||||
EpcApiError,
|
||||
|
|
@ -15,35 +13,11 @@ from backend.epc_client.exceptions import (
|
|||
from backend.epc_client._retry import call_with_retry
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
|
||||
|
||||
@dataclass
|
||||
class EpcSearchResult:
|
||||
certificate_number: str
|
||||
address_line_1: str
|
||||
address_line_2: Optional[str]
|
||||
address_line_3: Optional[str]
|
||||
address_line_4: Optional[str]
|
||||
postcode: str
|
||||
post_town: str
|
||||
uprn: Optional[int]
|
||||
current_energy_efficiency_band: str
|
||||
registration_date: str
|
||||
|
||||
@property
|
||||
def full_address(self) -> str:
|
||||
parts = [
|
||||
self.address_line_1,
|
||||
self.address_line_2,
|
||||
self.address_line_3,
|
||||
self.address_line_4,
|
||||
]
|
||||
return ", ".join(p for p in parts if p)
|
||||
from datatypes.epc.search import EpcSearchResult
|
||||
|
||||
|
||||
class EpcClientService:
|
||||
BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk"
|
||||
_MIN_MATCH_SCORE = 0.6
|
||||
|
||||
def __init__(self, auth_token: str) -> None:
|
||||
self._headers = {
|
||||
|
|
@ -65,34 +39,11 @@ class EpcClientService:
|
|||
def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]:
|
||||
return call_with_retry(lambda: self._search(postcode=postcode))
|
||||
|
||||
def find_best_match(self, postcode: str, address: str) -> Optional[EpcPropertyData]:
|
||||
from backend.utils.addressMatch import get_uprn_candidates
|
||||
|
||||
candidates = self.search_by_postcode(postcode)
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
# Round 1: score on addressLine1 only
|
||||
cert_num = self._pick_best_cert(
|
||||
candidates, address, use_full_address=False, fn=get_uprn_candidates
|
||||
)
|
||||
if cert_num:
|
||||
return self._safe_get(cert_num)
|
||||
|
||||
# Round 2: score on all address lines joined
|
||||
cert_num = self._pick_best_cert(
|
||||
candidates, address, use_full_address=True, fn=get_uprn_candidates
|
||||
)
|
||||
if cert_num:
|
||||
return self._safe_get(cert_num)
|
||||
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _fetch_certificate(self, cert_num: str) -> dict:
|
||||
def _fetch_certificate(self, cert_num: str) -> dict[str, Any]:
|
||||
resp = httpx.get(
|
||||
f"{self.BASE_URL}/api/certificate",
|
||||
params={"certificate_number": cert_num},
|
||||
|
|
@ -133,7 +84,7 @@ class EpcClientService:
|
|||
return [self._parse_search_result(r) for r in rows]
|
||||
|
||||
@staticmethod
|
||||
def _parse_search_result(row: dict) -> EpcSearchResult:
|
||||
def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult:
|
||||
return EpcSearchResult(
|
||||
certificate_number=row["certificateNumber"],
|
||||
address_line_1=row["addressLine1"],
|
||||
|
|
@ -146,43 +97,3 @@ class EpcClientService:
|
|||
current_energy_efficiency_band=row["currentEnergyEfficiencyBand"],
|
||||
registration_date=row["registrationDate"],
|
||||
)
|
||||
|
||||
def _pick_best_cert(
|
||||
self,
|
||||
candidates: list[EpcSearchResult],
|
||||
user_address: str,
|
||||
use_full_address: bool,
|
||||
fn: Callable[..., pd.DataFrame],
|
||||
) -> Optional[str]:
|
||||
df = pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"address": (
|
||||
r.full_address() if use_full_address else r.address_line_1
|
||||
),
|
||||
"uprn": str(r.uprn) if r.uprn is not None else "",
|
||||
"certificate_number": r.certificate_number,
|
||||
}
|
||||
for r in candidates
|
||||
]
|
||||
)
|
||||
|
||||
scored = fn(df, user_address=user_address)
|
||||
if scored.empty:
|
||||
return None
|
||||
|
||||
best_score = scored.iloc[0]["lexiscore"]
|
||||
if best_score < self._MIN_MATCH_SCORE:
|
||||
return None
|
||||
|
||||
top = scored[scored["lexirank"] == 1]
|
||||
if len(top) != 1:
|
||||
return None
|
||||
|
||||
return str(top.iloc[0]["certificate_number"])
|
||||
|
||||
def _safe_get(self, cert_num: str) -> Optional[EpcPropertyData]:
|
||||
try:
|
||||
return self.get_by_certificate_number(cert_num)
|
||||
except EpcNotFoundError:
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
httpx==0.28.1
|
||||
|
|
@ -1,7 +1,9 @@
|
|||
from unittest.mock import MagicMock, patch, call
|
||||
import pytest
|
||||
|
||||
from backend.epc_client.client import EpcClientService, EpcSearchResult
|
||||
from backend.epc_client.client import EpcClientService
|
||||
from backend.utils.epc_address_match import find_best_epc_match
|
||||
from datatypes.epc.search import EpcSearchResult
|
||||
from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||
from backend.epc_client.tests.conftest import make_search_row
|
||||
|
|
@ -122,88 +124,51 @@ def test_search_by_postcode_404_returns_empty_list(epc_service):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests 8-10: find_best_match
|
||||
# Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_scored_df(rows, scores, ranks):
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(rows)
|
||||
df["lexiscore"] = scores
|
||||
df["lexirank"] = ranks
|
||||
return df.sort_values("lexirank")
|
||||
|
||||
|
||||
def test_find_best_match_round1_clear_winner(epc_service, rdsap_21_0_1_cert):
|
||||
def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert):
|
||||
search_rows = [
|
||||
make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"),
|
||||
make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"),
|
||||
]
|
||||
cert_response = {"data": rdsap_21_0_1_cert}
|
||||
|
||||
df_rows = [
|
||||
{"address": "1 High Street", "uprn": "100023336956", "certificate_number": "CERT-WIN"},
|
||||
{"address": "99 Nowhere Lane", "uprn": "100023336956", "certificate_number": "CERT-LOSE"},
|
||||
]
|
||||
scored = _make_scored_df(df_rows, [0.9, 0.1], [1, 2])
|
||||
|
||||
def fake_get(url, params=None, **kwargs):
|
||||
if "search" in url:
|
||||
return _mock_response(200, {"data": search_rows})
|
||||
return _mock_response(200, cert_response)
|
||||
|
||||
with patch("httpx.get", side_effect=fake_get), \
|
||||
patch("backend.utils.addressMatch.get_uprn_candidates", return_value=scored):
|
||||
result = epc_service.find_best_match("SW1A 1AA", "1 High Street")
|
||||
with patch("httpx.get", side_effect=fake_get):
|
||||
result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street")
|
||||
|
||||
assert isinstance(result, EpcPropertyData)
|
||||
|
||||
|
||||
def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_0_1_cert):
|
||||
def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, rdsap_21_0_1_cert):
|
||||
# Both candidates share address_line_1 — round 1 is ambiguous.
|
||||
# Round 2 scores against full_address and picks the correct floor.
|
||||
search_rows = [
|
||||
make_search_row(
|
||||
cert_num="CERT-A", address_line_1="1 High Street",
|
||||
cert_num="CERT-A",
|
||||
address_line_1="1 High Street",
|
||||
address_line_2="Ground Floor",
|
||||
),
|
||||
make_search_row(
|
||||
cert_num="CERT-B", address_line_1="1 High Street",
|
||||
cert_num="CERT-B",
|
||||
address_line_1="1 High Street",
|
||||
address_line_2="First Floor",
|
||||
),
|
||||
]
|
||||
cert_response = {"data": rdsap_21_0_1_cert}
|
||||
|
||||
# Round 1: both score equally — ambiguous (two rank-1s)
|
||||
ambiguous = _make_scored_df(
|
||||
[
|
||||
{"address": "1 High Street", "uprn": "111", "certificate_number": "CERT-A"},
|
||||
{"address": "1 High Street", "uprn": "222", "certificate_number": "CERT-B"},
|
||||
],
|
||||
[0.9, 0.9],
|
||||
[1, 1],
|
||||
)
|
||||
# Round 2: CERT-A wins on full address
|
||||
resolved = _make_scored_df(
|
||||
[
|
||||
{"address": "1 High Street, Ground Floor", "uprn": "111", "certificate_number": "CERT-A"},
|
||||
{"address": "1 High Street, First Floor", "uprn": "222", "certificate_number": "CERT-B"},
|
||||
],
|
||||
[0.85, 0.4],
|
||||
[1, 2],
|
||||
)
|
||||
|
||||
call_count = {"n": 0}
|
||||
|
||||
def fake_candidates(df, user_address, **kwargs):
|
||||
call_count["n"] += 1
|
||||
return ambiguous if call_count["n"] == 1 else resolved
|
||||
|
||||
def fake_get(url, params=None, **kwargs):
|
||||
if "search" in url:
|
||||
return _mock_response(200, {"data": search_rows})
|
||||
return _mock_response(200, cert_response)
|
||||
|
||||
with patch("httpx.get", side_effect=fake_get), \
|
||||
patch("backend.utils.addressMatch.get_uprn_candidates", side_effect=fake_candidates):
|
||||
result = epc_service.find_best_match("SW1A 1AA", "1 High Street Ground Floor")
|
||||
with patch("httpx.get", side_effect=fake_get):
|
||||
result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street Ground Floor")
|
||||
|
||||
assert isinstance(result, EpcPropertyData)
|
||||
|
||||
|
|
@ -211,14 +176,7 @@ def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_
|
|||
def test_find_best_match_returns_none_when_no_good_match(epc_service):
|
||||
search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")]
|
||||
|
||||
low_score = _make_scored_df(
|
||||
[{"address": "99 Nowhere Lane", "uprn": "111", "certificate_number": "CERT-X"}],
|
||||
[0.1],
|
||||
[1],
|
||||
)
|
||||
|
||||
with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})), \
|
||||
patch("backend.utils.addressMatch.get_uprn_candidates", return_value=low_score):
|
||||
result = epc_service.find_best_match("SW1A 1AA", "1 Completely Different Road")
|
||||
with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})):
|
||||
result = find_best_epc_match(epc_service, "SW1A 1AA", "1 Completely Different Road")
|
||||
|
||||
assert result is None
|
||||
|
|
|
|||
60
backend/tests/test_address_match.py
Normal file
60
backend/tests/test_address_match.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from backend.utils.addressMatch import AddressMatch
|
||||
|
||||
|
||||
class TestNormaliseAddress:
|
||||
def test_lowercases_input(self):
|
||||
assert AddressMatch.normalise_address("1 HIGH STREET") == "1 high street"
|
||||
|
||||
def test_expands_road_abbreviation(self):
|
||||
assert AddressMatch.normalise_address("1 Moreton Rd") == "1 moreton road"
|
||||
|
||||
def test_expands_avenue_abbreviation(self):
|
||||
assert AddressMatch.normalise_address("2 Park Ave") == "2 park avenue"
|
||||
|
||||
def test_removes_punctuation_keeps_slash(self):
|
||||
result = AddressMatch.normalise_address("Flat 1/A, Some Road")
|
||||
assert "," not in result
|
||||
assert "/" in result
|
||||
|
||||
def test_splits_digit_letter_suffix(self):
|
||||
assert "42 a" in AddressMatch.normalise_address("42a Some Road")
|
||||
|
||||
def test_empty_string_returns_empty(self):
|
||||
assert AddressMatch.normalise_address("") == ""
|
||||
|
||||
def test_removes_no_prefix(self):
|
||||
result = AddressMatch.normalise_address("No 5 High Street")
|
||||
assert "no" not in result.split()
|
||||
assert "5" in result
|
||||
|
||||
|
||||
class TestScore:
|
||||
def test_identical_address_scores_one(self):
|
||||
assert AddressMatch.score("1 High Street", "1 High Street") == 1.0
|
||||
|
||||
def test_case_insensitive(self):
|
||||
assert AddressMatch.score("1 HIGH STREET", "1 high street") == 1.0
|
||||
|
||||
def test_street_type_synonym_scores_one(self):
|
||||
# "Rd" expands to "road" during normalisation — should be identical
|
||||
assert AddressMatch.score("1 High Rd", "1 High Road") == 1.0
|
||||
|
||||
def test_different_building_numbers_score_zero(self):
|
||||
assert AddressMatch.score("1 High Street", "2 High Street") == 0.0
|
||||
|
||||
def test_disjoint_number_sets_score_zero(self):
|
||||
assert AddressMatch.score("1 High Street", "99 Nowhere Lane") == 0.0
|
||||
|
||||
def test_user_address_has_number_but_epc_does_not_scores_zero(self):
|
||||
assert AddressMatch.score("1 High Street", "High Street") == 0.0
|
||||
|
||||
def test_partial_address_scores_above_threshold(self):
|
||||
# Extra token in user address ("London") — same building number, high overlap
|
||||
score = AddressMatch.score("1 High Street London", "1 High Street")
|
||||
assert 0.6 <= score < 1.0
|
||||
|
||||
def test_flat_number_mismatch_scores_zero(self):
|
||||
# User has two numbers but no "flat" token; EPC has different flat number
|
||||
# Triggers the order-sensitive flat guard
|
||||
score = AddressMatch.score("3 42 High Street", "Flat 7 42 High Street")
|
||||
assert score == 0.0
|
||||
|
|
@ -1,8 +1,13 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Optional
|
||||
from difflib import SequenceMatcher
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class AddressMatch:
|
||||
|
|
|
|||
67
backend/utils/epc_address_match.py
Normal file
67
backend/utils/epc_address_match.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from backend.utils.addressMatch import AddressMatch
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||
from datatypes.epc.search import EpcSearchResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from backend.epc_client.client import EpcClientService
|
||||
|
||||
_MIN_MATCH_SCORE = 0.6
|
||||
|
||||
|
||||
def find_best_epc_match(
|
||||
service: EpcClientService,
|
||||
postcode: str,
|
||||
address: str,
|
||||
) -> Optional[EpcPropertyData]:
|
||||
candidates = service.search_by_postcode(postcode)
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
cert_num = _pick_best_cert(candidates, address, use_full_address=False)
|
||||
if cert_num:
|
||||
return _safe_get(service, cert_num)
|
||||
|
||||
cert_num = _pick_best_cert(candidates, address, use_full_address=True)
|
||||
if cert_num:
|
||||
return _safe_get(service, cert_num)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _pick_best_cert(
|
||||
candidates: list[EpcSearchResult],
|
||||
user_address: str,
|
||||
use_full_address: bool,
|
||||
) -> Optional[str]:
|
||||
scored: list[tuple[float, str]] = [
|
||||
(
|
||||
AddressMatch.score(
|
||||
user_address,
|
||||
r.full_address if use_full_address else r.address_line_1,
|
||||
),
|
||||
r.certificate_number,
|
||||
)
|
||||
for r in candidates
|
||||
]
|
||||
if not scored:
|
||||
return None
|
||||
best_score = max(s for s, _ in scored)
|
||||
if best_score < _MIN_MATCH_SCORE:
|
||||
return None
|
||||
top = [cert for s, cert in scored if s == best_score]
|
||||
if len(top) != 1:
|
||||
return None
|
||||
return top[0]
|
||||
|
||||
|
||||
def _safe_get(service: EpcClientService, cert_num: str) -> Optional[EpcPropertyData]:
|
||||
from backend.epc_client.exceptions import EpcNotFoundError
|
||||
|
||||
try:
|
||||
return service.get_by_certificate_number(cert_num)
|
||||
except EpcNotFoundError:
|
||||
return None
|
||||
3
datatypes/epc/search/__init__.py
Normal file
3
datatypes/epc/search/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
from datatypes.epc.search.epc_search_result import EpcSearchResult
|
||||
|
||||
__all__ = ["EpcSearchResult"]
|
||||
28
datatypes/epc/search/epc_search_result.py
Normal file
28
datatypes/epc/search/epc_search_result.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class EpcSearchResult:
|
||||
certificate_number: str
|
||||
address_line_1: str
|
||||
address_line_2: Optional[str]
|
||||
address_line_3: Optional[str]
|
||||
address_line_4: Optional[str]
|
||||
postcode: str
|
||||
post_town: str
|
||||
uprn: Optional[int]
|
||||
current_energy_efficiency_band: str
|
||||
registration_date: str
|
||||
|
||||
@property
|
||||
def full_address(self) -> str:
|
||||
parts = [
|
||||
self.address_line_1,
|
||||
self.address_line_2,
|
||||
self.address_line_3,
|
||||
self.address_line_4,
|
||||
]
|
||||
return ", ".join(p for p in parts if p)
|
||||
|
|
@ -1,3 +1 @@
|
|||
[tool.pyright]
|
||||
reportUnknownMemberType = false
|
||||
reportUnknownVariableType = false
|
||||
Loading…
Add table
Reference in a new issue