fix: translate gov EPC API fuel codes to SAP10.2 Table 32 (v2.3.0)

predicted_total_fuel_cost_gbp was silently mispricing every non-gas
property because primary_main_fuel_type / water_heating_fuel store the
gov EPC API enum (26=mains gas, 27=LPG, 28=oil, 29=electricity) and our
_FUEL_UNIT_PRICE dict is keyed by Table 32 codes (1=gas, 4=oil, 30=elec).
Codes 26-29 therefore fell through to the dict's default of 3.48 p/kWh --
silently pricing LPG, oil and electricity (e.g. electric immersion) as
mains gas.

Concrete impact on the OX1 5LR Sep 2025 cert (our worst prediction: actual
SAP 41, model 84): water_heating_fuel=29 (electric immersion). Real DHW
cost is 2941 kWh * 13.19p = £388/yr; we computed 2941 * 3.48p = £102
(~4x under). Net
predicted_total_fuel_cost £292 vs implied real £2513 -- predicted_ecf
0.49 (~SAP 93) vs real ECF 4.24 (SAP 41).

Effect: every off-gas property's predicted_ecf was systematically too
low, dragging the model's catastrophic-low-SAP predictions toward
mid-band. Expected to substantially reduce decile-0 bias on retrain.

New _API_TO_TABLE32 map covers codes 0-29. 4 new AAA tests; VERSION
2.2.0 -> 2.3.0 (MINOR; behavioural fix to existing column values).
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-17 17:02:21 +00:00
parent 4df1ee78b7
commit 696d43112e
4 changed files with 75 additions and 5 deletions

View file

@ -155,9 +155,55 @@ _FUEL_UNIT_PRICE: Final[dict[int, float]] = {
}
# Gov EPC API fuel enum -> SAP10.2 Table 32 fuel-code mapping. The cert
# stores the API code in primary_main_fuel_type / water_heating_fuel; our
# price dict above is keyed by Table 32. Without this translation, codes
# 26-29 (the modern "not community" main_fuel codes) hit the default and
# silently pretend to be mains gas.
_API_TO_TABLE32: Final[dict[int, int]] = {
0: 30, # No system -> use standard electricity
1: 1, # mains gas (legacy) -> mains gas
2: 2, # LPG (legacy) -> bulk LPG
3: 3, # bottled LPG
4: 4, # oil (legacy) -> heating oil
5: 15, # anthracite
6: 20, # wood logs
7: 23, # bulk wood pellets
8: 21, # wood chips
9: 10, # dual fuel (mineral + wood)
10: 30, # electricity (legacy) -> standard electricity
11: 42, # waste combustion -> heat recovered from waste
12: 43, # biomass -> HN biomass equivalent
13: 44, # biogas - landfill -> HN biogas
14: 11, # house coal
15: 12, # smokeless coal -> manufactured smokeless fuel
16: 22, # wood pellets (secondary)
17: 9, # LPG special condition
18: 75, # B30K
19: 76, # bioethanol
20: 51, # mains gas (community) -> HN boilers mains gas
21: 52, # LPG (community) -> HN boilers LPG
22: 53, # oil (community) -> HN boilers oil
23: 55, # B30D (community)
24: 54, # coal (community)
25: 41, # electricity (community) -> HN electric heat pump
26: 1, # mains gas (not community) -> mains gas
27: 2, # LPG (not community) -> bulk LPG
28: 4, # oil (not community) -> heating oil
29: 30, # electricity (not community)-> standard electricity
}
def fuel_unit_price_p_per_kwh(fuel_code: Optional[int]) -> float:
    """Return the Table 32 unit price (p/kWh) for a fuel code.

    Accepts either a SAP10.2 Table 32 code or a gov EPC API
    main_fuel / water_heating_fuel code (the cert's native enum). A code
    that is not a key of `_FUEL_UNIT_PRICE` is translated through
    `_API_TO_TABLE32` before the price lookup.

    Args:
        fuel_code: Table 32 or gov EPC API fuel code, or None when the
            cert does not record one.

    Returns:
        The unit price in p/kWh. None, unmapped codes, and mapped codes
        with no price entry all fall back to mains gas (3.48 p/kWh), the
        dominant UK heating fuel.
    """
    # Single definition of the fallback so every exit path agrees.
    mains_gas_price = 3.48

    if fuel_code is None:
        return mains_gas_price

    # A code that is already a price key is taken as a Table 32 code.
    # NOTE(review): the two enumerations overlap numerically (e.g. 10 is
    # "electricity (legacy)" in the API enum but is the Table 32 target for
    # dual fuel in `_API_TO_TABLE32`). If `_FUEL_UNIT_PRICE` holds such a
    # key, the Table 32 reading wins here -- confirm that is intended.
    if fuel_code in _FUEL_UNIT_PRICE:
        return _FUEL_UNIT_PRICE[fuel_code]

    # Otherwise treat it as a gov EPC API code and translate it first.
    table32_code = _API_TO_TABLE32.get(fuel_code)
    if table32_code is None:
        return mains_gas_price
    return _FUEL_UNIT_PRICE.get(table32_code, mains_gas_price)

View file

@ -165,3 +165,27 @@ def test_fuel_unit_price_unknown_falls_back_to_mains_gas() -> None:
# Assert — mains gas typical (most common UK heating fuel).
assert result == pytest.approx(3.48, abs=0.01)
# Gov EPC API uses a different fuel enum from SAP10.2 Table 32. The mapper
# stores the API codes in primary_main_fuel_type / water_heating_fuel so we
# must translate (e.g. API 29 = electricity not community -> Table 32 30).
def test_fuel_unit_price_recognises_api_code_26_mains_gas_not_community() -> None:
    # Act — gov API code 26 is "mains gas (not community)".
    result = fuel_unit_price_p_per_kwh(fuel_code=26)

    # Assert — translated to Table 32 code 1, i.e. 3.48 p/kWh.
    assert result == pytest.approx(3.48, abs=0.01)
def test_fuel_unit_price_recognises_api_code_28_oil_not_community() -> None:
    # Act — gov API code 28 is oil (not community).
    result = fuel_unit_price_p_per_kwh(fuel_code=28)

    # Assert — should be priced as Table 32 oil, 5.44 p/kWh.
    assert result == pytest.approx(5.44, abs=0.01)
def test_fuel_unit_price_recognises_api_code_29_electricity_not_community() -> None:
    # Act — gov API code 29 is electricity (not community).
    result = fuel_unit_price_p_per_kwh(fuel_code=29)

    # Assert — standard electricity tariff, 13.19 p/kWh.
    assert result == pytest.approx(13.19, abs=0.01)
def test_fuel_unit_price_recognises_api_code_27_lpg_not_community() -> None:
    # Act — gov API code 27 is LPG (not community).
    result = fuel_unit_price_p_per_kwh(fuel_code=27)

    # Assert — priced as bulk LPG, 7.60 p/kWh.
    assert result == pytest.approx(7.60, abs=0.01)

View file

@ -36,7 +36,7 @@ def test_transform_advertises_version_and_target_columns() -> None:
# Assert
assert isinstance(schema, TransformSchema)
assert schema.transform_version == "2.2.0"
assert schema.transform_version == "2.3.0"
assert schema.transform_version == EpcMlTransform.VERSION
assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys())
for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items():

View file

@ -901,7 +901,7 @@ class EpcMlTransform:
Version 0.1.0 schema contract only; feature columns added in subsequent slices.
"""
VERSION: str = "2.2.0"
VERSION: str = "2.3.0"
def schema(self) -> TransformSchema:
"""The cross-repo ML data contract.