diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index e49088f4..642733a7 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -24,6 +24,7 @@ from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) from backend.epc_client.client import EpcClientService +from datatypes.epc.domain.historic_epc_matching import ScoredHistoricEpc logger = setup_logger() @@ -64,7 +65,7 @@ def get_uprn_from_historic_epc( if not uprn or uprn == "nan": return None - top = result.top() + top: Optional[ScoredHistoricEpc] = result.top() if top is None: return None diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py index 1c3ee6d4..ce86e5c0 100644 --- a/datatypes/epc/domain/tests/test_historic_epc_matching.py +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -1,3 +1,4 @@ +from typing import Optional from unittest.mock import patch import numpy as np @@ -13,40 +14,103 @@ from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) - # Columns required by the HistoricEpc dataclass (lower-cased CSV columns). # The matcher only reads ADDRESS + UPRN to score; everything else is filled # with "" but must be present for HistoricEpc(**kwargs) to construct. _FULL_COLUMN_FIELDS = [ - "LMK_KEY", "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE", - "BUILDING_REFERENCE_NUMBER", "CURRENT_ENERGY_RATING", "POTENTIAL_ENERGY_RATING", - "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY", "PROPERTY_TYPE", - "BUILT_FORM", "INSPECTION_DATE", "LOCAL_AUTHORITY", "CONSTITUENCY", "COUNTY", - "LODGEMENT_DATE", "TRANSACTION_TYPE", "ENVIRONMENT_IMPACT_CURRENT", - "ENVIRONMENT_IMPACT_POTENTIAL", "ENERGY_CONSUMPTION_CURRENT", - "ENERGY_CONSUMPTION_POTENTIAL", "CO2_EMISSIONS_CURRENT", - "CO2_EMISS_CURR_PER_FLOOR_AREA", "CO2_EMISSIONS_POTENTIAL", - "LIGHTING_COST_CURRENT", "LIGHTING_COST_POTENTIAL", "HEATING_COST_CURRENT", - "HEATING_COST_POTENTIAL", "HOT_WATER_COST_CURRENT", "HOT_WATER_COST_POTENTIAL", - "TOTAL_FLOOR_AREA", "ENERGY_TARIFF", "MAINS_GAS_FLAG", "FLOOR_LEVEL", - "FLAT_TOP_STOREY", "FLAT_STOREY_COUNT", "MAIN_HEATING_CONTROLS", - "MULTI_GLAZE_PROPORTION", "GLAZED_TYPE", "GLAZED_AREA", "EXTENSION_COUNT", - "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "LOW_ENERGY_LIGHTING", - "NUMBER_OPEN_FIREPLACES", "HOTWATER_DESCRIPTION", "HOT_WATER_ENERGY_EFF", - "HOT_WATER_ENV_EFF", "FLOOR_DESCRIPTION", "FLOOR_ENERGY_EFF", "FLOOR_ENV_EFF", - "WINDOWS_DESCRIPTION", "WINDOWS_ENERGY_EFF", "WINDOWS_ENV_EFF", - "WALLS_DESCRIPTION", "WALLS_ENERGY_EFF", "WALLS_ENV_EFF", - "SECONDHEAT_DESCRIPTION", "SHEATING_ENERGY_EFF", "SHEATING_ENV_EFF", - "ROOF_DESCRIPTION", "ROOF_ENERGY_EFF", "ROOF_ENV_EFF", "MAINHEAT_DESCRIPTION", - "MAINHEAT_ENERGY_EFF", "MAINHEAT_ENV_EFF", "MAINHEATCONT_DESCRIPTION", - "MAINHEATC_ENERGY_EFF", "MAINHEATC_ENV_EFF", "LIGHTING_DESCRIPTION", - "LIGHTING_ENERGY_EFF", "LIGHTING_ENV_EFF", "MAIN_FUEL", "WIND_TURBINE_COUNT", - "HEAT_LOSS_CORRIDOR", "UNHEATED_CORRIDOR_LENGTH", "FLOOR_HEIGHT", - "PHOTO_SUPPLY", "SOLAR_WATER_HEATING_FLAG", "MECHANICAL_VENTILATION", - "ADDRESS", "LOCAL_AUTHORITY_LABEL", "CONSTITUENCY_LABEL", "POSTTOWN", - "CONSTRUCTION_AGE_BAND", "LODGEMENT_DATETIME", "TENURE", - "FIXED_LIGHTING_OUTLETS_COUNT", "LOW_ENERGY_FIXED_LIGHT_COUNT", "UPRN", - "UPRN_SOURCE", "REPORT_TYPE", + "LMK_KEY", + "ADDRESS1", + "ADDRESS2", + "ADDRESS3", + "POSTCODE", + "BUILDING_REFERENCE_NUMBER", + "CURRENT_ENERGY_RATING", + "POTENTIAL_ENERGY_RATING", + "CURRENT_ENERGY_EFFICIENCY", + "POTENTIAL_ENERGY_EFFICIENCY", + "PROPERTY_TYPE", + "BUILT_FORM", + "INSPECTION_DATE", + "LOCAL_AUTHORITY", + "CONSTITUENCY", + "COUNTY", + "LODGEMENT_DATE", + "TRANSACTION_TYPE", + "ENVIRONMENT_IMPACT_CURRENT", + "ENVIRONMENT_IMPACT_POTENTIAL", + "ENERGY_CONSUMPTION_CURRENT", + "ENERGY_CONSUMPTION_POTENTIAL", + "CO2_EMISSIONS_CURRENT", + "CO2_EMISS_CURR_PER_FLOOR_AREA", + "CO2_EMISSIONS_POTENTIAL", + "LIGHTING_COST_CURRENT", + "LIGHTING_COST_POTENTIAL", + "HEATING_COST_CURRENT", + "HEATING_COST_POTENTIAL", + "HOT_WATER_COST_CURRENT", + "HOT_WATER_COST_POTENTIAL", + "TOTAL_FLOOR_AREA", + "ENERGY_TARIFF", + "MAINS_GAS_FLAG", + "FLOOR_LEVEL", + "FLAT_TOP_STOREY", + "FLAT_STOREY_COUNT", + "MAIN_HEATING_CONTROLS", + "MULTI_GLAZE_PROPORTION", + "GLAZED_TYPE", + "GLAZED_AREA", + "EXTENSION_COUNT", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "HOTWATER_DESCRIPTION", + "HOT_WATER_ENERGY_EFF", + "HOT_WATER_ENV_EFF", + "FLOOR_DESCRIPTION", + "FLOOR_ENERGY_EFF", + "FLOOR_ENV_EFF", + "WINDOWS_DESCRIPTION", + "WINDOWS_ENERGY_EFF", + "WINDOWS_ENV_EFF", + "WALLS_DESCRIPTION", + "WALLS_ENERGY_EFF", + "WALLS_ENV_EFF", + "SECONDHEAT_DESCRIPTION", + "SHEATING_ENERGY_EFF", + "SHEATING_ENV_EFF", + "ROOF_DESCRIPTION", + "ROOF_ENERGY_EFF", + "ROOF_ENV_EFF", + "MAINHEAT_DESCRIPTION", + "MAINHEAT_ENERGY_EFF", + "MAINHEAT_ENV_EFF", + "MAINHEATCONT_DESCRIPTION", + "MAINHEATC_ENERGY_EFF", + "MAINHEATC_ENV_EFF", + "LIGHTING_DESCRIPTION", + "LIGHTING_ENERGY_EFF", + "LIGHTING_ENV_EFF", + "MAIN_FUEL", + "WIND_TURBINE_COUNT", + "HEAT_LOSS_CORRIDOR", + "UNHEATED_CORRIDOR_LENGTH", + "FLOOR_HEIGHT", + "PHOTO_SUPPLY", + "SOLAR_WATER_HEATING_FLAG", + "MECHANICAL_VENTILATION", + "ADDRESS", + "LOCAL_AUTHORITY_LABEL", + "CONSTITUENCY_LABEL", + "POSTTOWN", + "CONSTRUCTION_AGE_BAND", + "LODGEMENT_DATETIME", + "TENURE", + "FIXED_LIGHTING_OUTLETS_COUNT", + "LOW_ENERGY_FIXED_LIGHT_COUNT", + "UPRN", + "UPRN_SOURCE", + "REPORT_TYPE", ] @@ -63,7 +127,9 @@ def _build_df(rows: list[dict]) -> pd.DataFrame: @pytest.fixture def patch_postcode_valid(): - with patch.object(matcher_mod.AddressMatch, "is_valid_postcode", return_value=True) as m: + with patch.object( + matcher_mod.AddressMatch, "is_valid_postcode", return_value=True + ) as m: yield m @@ -106,10 +172,12 @@ class TestMatchAddressesForPostcode: self, patch_read, patch_postcode_valid ): # Disjoint number sets => hard zero. Still kept in matches. - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("999 SOMEWHERE ELSE", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("999 SOMEWHERE ELSE", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert isinstance(result, HistoricEpcMatches) assert len(result.matches) == 2 @@ -117,10 +185,12 @@ class TestMatchAddressesForPostcode: def test_top_has_lexirank_one_and_lexiscore_monotone( self, patch_read, patch_postcode_valid ): - patch_read.return_value = _build_df([ - _row("48 GORDON ROAD", "200"), # near miss - _row("47 GORDON ROAD", "100"), # exact (after normalisation) - ]) + patch_read.return_value = _build_df( + [ + _row("48 GORDON ROAD", "200"), # near miss + _row("47 GORDON ROAD", "100"), # exact (after normalisation) + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.top().lexirank == 1 scores = [m.lexiscore for m in result.matches] @@ -173,19 +243,23 @@ class TestMatchAddressesForPostcode: class TestUnambiguousUprn: def test_exact_match_returns_uprn(self, patch_read, patch_postcode_valid): - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("48 GORDON ROAD", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.unambiguous_uprn() == "100" def test_ambiguous_tie_returns_none(self, patch_read, patch_postcode_valid): # Two duplicate addresses with different UPRNs share rank-1. - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("47 GORDON ROAD", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("47 GORDON ROAD", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.unambiguous_uprn() is None @@ -193,10 +267,12 @@ class TestUnambiguousUprn: self, patch_read, patch_postcode_valid ): # User address has building number 47; no row has 47 -> all hard-zero. - patch_read.return_value = _build_df([ - _row("999 ELSEWHERE", "100"), - _row("888 ELSEWHERE", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("999 ELSEWHERE", "100"), + _row("888 ELSEWHERE", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert all(m.lexiscore == 0.0 for m in result.matches) assert result.unambiguous_uprn() is None @@ -205,15 +281,22 @@ class TestUnambiguousUprn: self, patch_read, patch_postcode_valid ): # Use a real NaN in the UPRN cell. - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", np.nan), - _row("48 GORDON ROAD", "200"), - ]) - result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") - top = result.top() + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", np.nan), + _row("48 GORDON ROAD", "200"), + ] + ) + result: HistoricEpcMatches = match_addresses_for_postcode( + "47 Gordon Road", "AB33 8AL" + ) + top: Optional[ScoredHistoricEpc] = result.top() # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # so unambiguous_uprn's truthiness check correctly drops the row. - assert top.record.uprn == "" + if top: + assert top.record.uprn == "" + else: + pytest.fail("should have an epc score, no results found :(") # ---------- top / top_n ---------- @@ -222,11 +305,13 @@ class TestUnambiguousUprn: class TestTopHelpers: def test_top_n_returns_first_k(self, patch_read, patch_postcode_valid): - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("48 GORDON ROAD", "200"), - _row("49 GORDON ROAD", "300"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + _row("49 GORDON ROAD", "300"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") top2 = result.top_n(2) assert len(top2) == 2