diff --git a/CLAUDE.md b/CLAUDE.md index 263679ff..23d465a7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -59,3 +59,9 @@ New containers install all skills automatically via the Dockerfile. If you're in bash .devcontainer/backend/install-claude-skills.sh ``` +## Type Safety + +All new code must pass `pyright` with zero errors under `typeCheckingMode = strict`. +Annotate all function return types. Use `dict[str, Any]` for untyped external API +payloads — never bare `dict`. Add `pandas-stubs` when introducing pandas to a module. + diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index 9fdbfe4c..80907a79 100644 --- a/backend/app/requirements/requirements.txt +++ b/backend/app/requirements/requirements.txt @@ -13,4 +13,9 @@ boto3==1.35.44 openpyxl==3.1.5 # Basic pytz -sqlmodel \ No newline at end of file +sqlmodel +# HTTP client +httpx==0.28.1 +# Data +pandas +pandas-stubs \ No newline at end of file diff --git a/backend/epc_client/__init__.py b/backend/epc_client/__init__.py index 720594f7..ab46a266 100644 --- a/backend/epc_client/__init__.py +++ b/backend/epc_client/__init__.py @@ -1,3 +1,3 @@ -from backend.epc_client.client import EpcClientService, EpcSearchResult +from backend.epc_client.client import EpcClientService -__all__ = ["EpcClientService", "EpcSearchResult"] +__all__ = ["EpcClientService"] diff --git a/backend/epc_client/client.py b/backend/epc_client/client.py index 0e3b48fc..d00a164f 100644 --- a/backend/epc_client/client.py +++ b/backend/epc_client/client.py @@ -1,11 +1,9 @@ # Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml from __future__ import annotations -from dataclasses import dataclass -from typing import Callable, Optional +from typing import Any, Optional import httpx -import pandas as pd from backend.epc_client.exceptions import ( EpcApiError, @@ -15,35 +13,11 @@ from backend.epc_client.exceptions import ( from backend.epc_client._retry import call_with_retry from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.domain.mapper import EpcPropertyDataMapper - - -@dataclass -class EpcSearchResult: - certificate_number: str - address_line_1: str - address_line_2: Optional[str] - address_line_3: Optional[str] - address_line_4: Optional[str] - postcode: str - post_town: str - uprn: Optional[int] - current_energy_efficiency_band: str - registration_date: str - - @property - def full_address(self) -> str: - parts = [ - self.address_line_1, - self.address_line_2, - self.address_line_3, - self.address_line_4, - ] - return ", ".join(p for p in parts if p) +from datatypes.epc.search import EpcSearchResult class EpcClientService: BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" - _MIN_MATCH_SCORE = 0.6 def __init__(self, auth_token: str) -> None: self._headers = { @@ -65,34 +39,11 @@ class EpcClientService: def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: return call_with_retry(lambda: self._search(postcode=postcode)) - def find_best_match(self, postcode: str, address: str) -> Optional[EpcPropertyData]: - from backend.utils.addressMatch import get_uprn_candidates - - candidates = self.search_by_postcode(postcode) - if not candidates: - return None - - # Round 1: score on addressLine1 only - cert_num = self._pick_best_cert( - candidates, address, use_full_address=False, fn=get_uprn_candidates - ) - if cert_num: - return self._safe_get(cert_num) - - # Round 2: score on all address lines joined - cert_num = self._pick_best_cert( - candidates, address, use_full_address=True, fn=get_uprn_candidates - ) - if cert_num: - return self._safe_get(cert_num) - - return None - # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ - def _fetch_certificate(self, cert_num: str) -> dict: + def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: resp = httpx.get( f"{self.BASE_URL}/api/certificate", params={"certificate_number": cert_num}, @@ -133,7 +84,7 @@ class EpcClientService: return [self._parse_search_result(r) for r in rows] @staticmethod - def _parse_search_result(row: dict) -> EpcSearchResult: + def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: return EpcSearchResult( certificate_number=row["certificateNumber"], address_line_1=row["addressLine1"], @@ -146,43 +97,3 @@ class EpcClientService: current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], registration_date=row["registrationDate"], ) - - def _pick_best_cert( - self, - candidates: list[EpcSearchResult], - user_address: str, - use_full_address: bool, - fn: Callable[..., pd.DataFrame], - ) -> Optional[str]: - df = pd.DataFrame( - [ - { - "address": ( - r.full_address() if use_full_address else r.address_line_1 - ), - "uprn": str(r.uprn) if r.uprn is not None else "", - "certificate_number": r.certificate_number, - } - for r in candidates - ] - ) - - scored = fn(df, user_address=user_address) - if scored.empty: - return None - - best_score = scored.iloc[0]["lexiscore"] - if best_score < self._MIN_MATCH_SCORE: - return None - - top = scored[scored["lexirank"] == 1] - if len(top) != 1: - return None - - return str(top.iloc[0]["certificate_number"]) - - def _safe_get(self, cert_num: str) -> Optional[EpcPropertyData]: - try: - return self.get_by_certificate_number(cert_num) - except EpcNotFoundError: - return None diff --git a/backend/epc_client/requirements.txt b/backend/epc_client/requirements.txt deleted file mode 100644 index cee32373..00000000 --- a/backend/epc_client/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -httpx==0.28.1 diff --git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py index 51dd2a12..7933f21d 100644 --- a/backend/epc_client/tests/test_client.py +++ b/backend/epc_client/tests/test_client.py @@ -1,7 +1,9 @@ from unittest.mock import MagicMock, patch, call import pytest -from backend.epc_client.client import EpcClientService, EpcSearchResult +from backend.epc_client.client import EpcClientService +from backend.utils.epc_address_match import find_best_epc_match +from datatypes.epc.search import EpcSearchResult from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError from datatypes.epc.domain.epc_property_data import EpcPropertyData from backend.epc_client.tests.conftest import make_search_row @@ -122,88 +124,51 @@ def test_search_by_postcode_404_returns_empty_list(epc_service): # --------------------------------------------------------------------------- -# Tests 8-10: find_best_match +# Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked # --------------------------------------------------------------------------- -def _make_scored_df(rows, scores, ranks): - import pandas as pd - df = pd.DataFrame(rows) - df["lexiscore"] = scores - df["lexirank"] = ranks - return df.sort_values("lexirank") - - -def test_find_best_match_round1_clear_winner(epc_service, rdsap_21_0_1_cert): +def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert): search_rows = [ make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"), ] cert_response = {"data": rdsap_21_0_1_cert} - df_rows = [ - {"address": "1 High Street", "uprn": "100023336956", "certificate_number": "CERT-WIN"}, - {"address": "99 Nowhere Lane", "uprn": "100023336956", "certificate_number": "CERT-LOSE"}, - ] - scored = _make_scored_df(df_rows, [0.9, 0.1], [1, 2]) - def fake_get(url, params=None, **kwargs): if "search" in url: return _mock_response(200, {"data": search_rows}) return _mock_response(200, cert_response) - with patch("httpx.get", side_effect=fake_get), \ - patch("backend.utils.addressMatch.get_uprn_candidates", return_value=scored): - result = epc_service.find_best_match("SW1A 1AA", "1 High Street") + with patch("httpx.get", side_effect=fake_get): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street") assert isinstance(result, EpcPropertyData) -def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_0_1_cert): +def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, rdsap_21_0_1_cert): + # Both candidates share address_line_1 — round 1 is ambiguous. + # Round 2 scores against full_address and picks the correct floor. search_rows = [ make_search_row( - cert_num="CERT-A", address_line_1="1 High Street", + cert_num="CERT-A", + address_line_1="1 High Street", address_line_2="Ground Floor", ), make_search_row( - cert_num="CERT-B", address_line_1="1 High Street", + cert_num="CERT-B", + address_line_1="1 High Street", address_line_2="First Floor", ), ] cert_response = {"data": rdsap_21_0_1_cert} - # Round 1: both score equally — ambiguous (two rank-1s) - ambiguous = _make_scored_df( - [ - {"address": "1 High Street", "uprn": "111", "certificate_number": "CERT-A"}, - {"address": "1 High Street", "uprn": "222", "certificate_number": "CERT-B"}, - ], - [0.9, 0.9], - [1, 1], - ) - # Round 2: CERT-A wins on full address - resolved = _make_scored_df( - [ - {"address": "1 High Street, Ground Floor", "uprn": "111", "certificate_number": "CERT-A"}, - {"address": "1 High Street, First Floor", "uprn": "222", "certificate_number": "CERT-B"}, - ], - [0.85, 0.4], - [1, 2], - ) - - call_count = {"n": 0} - - def fake_candidates(df, user_address, **kwargs): - call_count["n"] += 1 - return ambiguous if call_count["n"] == 1 else resolved - def fake_get(url, params=None, **kwargs): if "search" in url: return _mock_response(200, {"data": search_rows}) return _mock_response(200, cert_response) - with patch("httpx.get", side_effect=fake_get), \ - patch("backend.utils.addressMatch.get_uprn_candidates", side_effect=fake_candidates): - result = epc_service.find_best_match("SW1A 1AA", "1 High Street Ground Floor") + with patch("httpx.get", side_effect=fake_get): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street Ground Floor") assert isinstance(result, EpcPropertyData) @@ -211,14 +176,7 @@ def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_ def test_find_best_match_returns_none_when_no_good_match(epc_service): search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] - low_score = _make_scored_df( - [{"address": "99 Nowhere Lane", "uprn": "111", "certificate_number": "CERT-X"}], - [0.1], - [1], - ) - - with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})), \ - patch("backend.utils.addressMatch.get_uprn_candidates", return_value=low_score): - result = epc_service.find_best_match("SW1A 1AA", "1 Completely Different Road") + with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 Completely Different Road") assert result is None diff --git a/backend/tests/test_address_match.py b/backend/tests/test_address_match.py new file mode 100644 index 00000000..f6a564df --- /dev/null +++ b/backend/tests/test_address_match.py @@ -0,0 +1,60 @@ +from backend.utils.addressMatch import AddressMatch + + +class TestNormaliseAddress: + def test_lowercases_input(self): + assert AddressMatch.normalise_address("1 HIGH STREET") == "1 high street" + + def test_expands_road_abbreviation(self): + assert AddressMatch.normalise_address("1 Moreton Rd") == "1 moreton road" + + def test_expands_avenue_abbreviation(self): + assert AddressMatch.normalise_address("2 Park Ave") == "2 park avenue" + + def test_removes_punctuation_keeps_slash(self): + result = AddressMatch.normalise_address("Flat 1/A, Some Road") + assert "," not in result + assert "/" in result + + def test_splits_digit_letter_suffix(self): + assert "42 a" in AddressMatch.normalise_address("42a Some Road") + + def test_empty_string_returns_empty(self): + assert AddressMatch.normalise_address("") == "" + + def test_removes_no_prefix(self): + result = AddressMatch.normalise_address("No 5 High Street") + assert "no" not in result.split() + assert "5" in result + + +class TestScore: + def test_identical_address_scores_one(self): + assert AddressMatch.score("1 High Street", "1 High Street") == 1.0 + + def test_case_insensitive(self): + assert AddressMatch.score("1 HIGH STREET", "1 high street") == 1.0 + + def test_street_type_synonym_scores_one(self): + # "Rd" expands to "road" during normalisation — should be identical + assert AddressMatch.score("1 High Rd", "1 High Road") == 1.0 + + def test_different_building_numbers_score_zero(self): + assert AddressMatch.score("1 High Street", "2 High Street") == 0.0 + + def test_disjoint_number_sets_score_zero(self): + assert AddressMatch.score("1 High Street", "99 Nowhere Lane") == 0.0 + + def test_user_address_has_number_but_epc_does_not_scores_zero(self): + assert AddressMatch.score("1 High Street", "High Street") == 0.0 + + def test_partial_address_scores_above_threshold(self): + # Extra token in user address ("London") — same building number, high overlap + score = AddressMatch.score("1 High Street London", "1 High Street") + assert 0.6 <= score < 1.0 + + def test_flat_number_mismatch_scores_zero(self): + # User has two numbers but no "flat" token; EPC has different flat number + # Triggers the order-sensitive flat guard + score = AddressMatch.score("3 42 High Street", "Flat 7 42 High Street") + assert score == 0.0 diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 12c1ac53..a0c6ebdf 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import re -from typing import Any, Optional from difflib import SequenceMatcher +from typing import TYPE_CHECKING, Any, Optional + import requests -import pandas as pd + +if TYPE_CHECKING: + import pandas as pd class AddressMatch: diff --git a/backend/utils/epc_address_match.py b/backend/utils/epc_address_match.py new file mode 100644 index 00000000..f73d6d1d --- /dev/null +++ b/backend/utils/epc_address_match.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +from backend.utils.addressMatch import AddressMatch +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.search import EpcSearchResult + +if TYPE_CHECKING: + from backend.epc_client.client import EpcClientService + +_MIN_MATCH_SCORE = 0.6 + + +def find_best_epc_match( + service: EpcClientService, + postcode: str, + address: str, +) -> Optional[EpcPropertyData]: + candidates = service.search_by_postcode(postcode) + if not candidates: + return None + + cert_num = _pick_best_cert(candidates, address, use_full_address=False) + if cert_num: + return _safe_get(service, cert_num) + + cert_num = _pick_best_cert(candidates, address, use_full_address=True) + if cert_num: + return _safe_get(service, cert_num) + + return None + + +def _pick_best_cert( + candidates: list[EpcSearchResult], + user_address: str, + use_full_address: bool, +) -> Optional[str]: + scored: list[tuple[float, str]] = [ + ( + AddressMatch.score( + user_address, + r.full_address if use_full_address else r.address_line_1, + ), + r.certificate_number, + ) + for r in candidates + ] + if not scored: + return None + best_score = max(s for s, _ in scored) + if best_score < _MIN_MATCH_SCORE: + return None + top = [cert for s, cert in scored if s == best_score] + if len(top) != 1: + return None + return top[0] + + +def _safe_get(service: EpcClientService, cert_num: str) -> Optional[EpcPropertyData]: + from backend.epc_client.exceptions import EpcNotFoundError + + try: + return service.get_by_certificate_number(cert_num) + except EpcNotFoundError: + return None diff --git a/datatypes/epc/search/__init__.py b/datatypes/epc/search/__init__.py new file mode 100644 index 00000000..3e08a56e --- /dev/null +++ b/datatypes/epc/search/__init__.py @@ -0,0 +1,3 @@ +from datatypes.epc.search.epc_search_result import EpcSearchResult + +__all__ = ["EpcSearchResult"] diff --git a/datatypes/epc/search/epc_search_result.py b/datatypes/epc/search/epc_search_result.py new file mode 100644 index 00000000..b6f47caf --- /dev/null +++ b/datatypes/epc/search/epc_search_result.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class EpcSearchResult: + certificate_number: str + address_line_1: str + address_line_2: Optional[str] + address_line_3: Optional[str] + address_line_4: Optional[str] + postcode: str + post_town: str + uprn: Optional[int] + current_energy_efficiency_band: str + registration_date: str + + @property + def full_address(self) -> str: + parts = [ + self.address_line_1, + self.address_line_2, + self.address_line_3, + self.address_line_4, + ] + return ", ".join(p for p in parts if p) diff --git a/pyproject.toml b/pyproject.toml index 72ec3f0c..49108861 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1 @@ [tool.pyright] -reportUnknownMemberType = false -reportUnknownVariableType = false \ No newline at end of file