Model/tests/domain/sap10_calculator/test_pcdb_etl.py
Khalim Conn-Kowlessar d7d5084f90 Move sap10_calculator tests to tests/domain/sap10_calculator/ for CI
The calculator tests lived under domain/sap10_calculator/{tests,worksheet/
tests,rdsap/tests,climate/tests,validation/tests}, none of which are in
pytest.ini testpaths — so CI (which collects tests/) never ran them. Relocate
all five dirs to tests/domain/sap10_calculator/{,worksheet,rdsap,climate,
validation}, mirroring the tests/domain/property_baseline/ convention, so the
cascade-pin / golden / e2e conformance suites run in CI.

Mechanics:
- git mv preserves history (110 files).
- Flattening the trailing /tests keeps each file's depth-to-repo-root
  identical, so all 16 repo-root parents[4] fixture refs stay valid. Only
  test_pcdb_etl.py's parents[1] (→ pcdb data) and one hardcoded absolute
  golden-fixture path in test_cert_to_inputs.py needed rebasing.
- Cross-imports rewritten domain.sap10_calculator.worksheet.tests →
  tests.domain.sap10_calculator.worksheet (21 files incl. the external
  importer backend/documents_parser/tests/test_summary_pdf_mapper_chain.py).
- Golden-fixture path strings in test_summary_pdf_mapper_chain.py +
  scripts/fetch_cohort2_api_jsons.py updated to the new location (the JSONs
  moved with the rdsap tests).

load_cells / gitignored worksheet xlsx: the xlsx-pinned tests (test_dimensions
/ ventilation / water_heating) read 2026-05-19-17-18 RdSap10Worksheet.xlsx,
which is gitignored (.gitignore `*.xlsx`) and so absent in CI. _xlsx_loader.
load_cells now pytest.skip()s when the file is absent, so those tests run
locally and skip cleanly in CI instead of erroring — no new CI failures from
the move, and the gitignore policy is respected.

Verified: tests/domain/sap10_calculator + backend/documents_parser +
tests/domain/property_baseline = 2248 pass, 1 skipped; pyright resolves the
new import paths with zero import-resolution errors.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 16:58:00 +00:00

327 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for the BRE PCDB (pcdb10.dat) ETL parser.
The PCDB is a multi-table comma-separated data file published by BRE.
Each table has its own format (`$<table_id>,<format>,...`) and its own
field schema. This module verifies that the per-table parsers decode
records into typed dicts matching ground-truth records the user
verified against https://www.ncm-pcdb.org.uk.
Reference: BRE Product Characteristics Database — pcdb10.dat (April 2026).
"""
from __future__ import annotations
from pathlib import Path
import pytest
from domain.sap10_calculator.tables.pcdb.etl import run_etl
from domain.sap10_calculator.tables.pcdb.parser import (
parse_table_105,
parse_table_105_row,
parse_table_raw,
)
# Location of the BRE data file, anchored on this test module: climb three
# levels above the module's own directory, then descend into the calculator
# package's PCDB data directory.
_PCDB_DAT_PATH: Path = Path(__file__).resolve().parents[3].joinpath(
    "domain", "sap10_calculator", "tables", "pcdb", "data", "pcdb10.dat"
)
# Raw Table 105 row for pcdb_id 98 (Baxi Heating Wm 20/3rs); the decoded
# values were verified by the user against ncm-pcdb.org.uk.
_BAXI_98_RAW: str = (
    "000098,000005,0,2010/Sep/13 17:03,"
    "Baxi Heating,Baxi Heating,Wm,20/3rs,4107739,,1990,"
    "1,0,0,1,0,,,1,2,1,5.86,5.86,,,66.0,56.0,,40.8,,3,,,0,2,0,"
    ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,"
)
# Raw Table 105 row for pcdb_id 16839 (Vaillant ecoTEC pro 28
# VUW GB 286/5-3), the boiler behind the 000474 Elmhurst fixture.
# Verified by ground-truth arithmetic against that fixture's PDF pin
# Σ(61) = 337.19, using the Table 3b row 1 formula:
#   Σ(61) = (45) × r1 × fu + F1 × 365
#         = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27.
# Combi-loss fields (BRE PCDF Spec v1.0 §7.11 fields 48/51/52/56/57):
#   separate_dhw_tests            = 1 (one test, profile M → Table 3b)
#   rejected_energy_proportion_r1 = 0.0025
#   loss_factor_f1_kwh_per_day    = 0.91251
#   loss_factor_f2 / rejected_factor_f3 = blank (Table 3c not used)
_VAILLANT_16839_RAW: str = (
    "016839,000031,0,2019/Mar/04 10:28,"
    "Vaillant,Vaillant,ecoTEC pro 28,VUW GB 286/5-3,GC 47-044-45,"
    "2005,2015,1,2,1,2,0,,,2,2,2,24.4,24.4,,,"
    "88.7,87.0,,75.1,,2,,,104,1,2,105,2,0,,,,0,,,,,1,7.012,0.133,0.0025,"
    "0.91251,,,,,,1,1,,0045,,,,,,,,,89.0,98.0,,,,,96.3"
)
def test_table_105_parser_extracts_baxi_98_known_fields() -> None:
    """Parse the user-verified Baxi 000098 row and pin its known fields.

    Expected values were cross-checked against the ncm-pcdb.org.uk entry
    for pcdb_id 98: brand "Baxi Heating", model "Wm", qualifier "20/3rs",
    SAP winter seasonal efficiency 66.0%, SAP summer seasonal efficiency
    56.0%, comparative hot water efficiency 40.8%, output 5.86 kW and
    final year of manufacture 1990.
    """
    # Act
    baxi = parse_table_105_row(_BAXI_98_RAW)

    # Assert — identity fields, then the three efficiencies, then the rest.
    identity = (
        baxi.pcdb_id,
        baxi.brand_name,
        baxi.model_name,
        baxi.model_qualifier,
    )
    efficiencies = (
        baxi.winter_efficiency_pct,
        baxi.summer_efficiency_pct,
        baxi.comparative_hot_water_efficiency_pct,
    )
    assert identity == (98, "Baxi Heating", "Wm", "20/3rs")
    assert efficiencies == (66.0, 56.0, 40.8)
    assert baxi.output_kw_max == 5.86
    assert baxi.final_year_of_manufacture == 1990
# Two further user-verified raw rows (Potterton Myson 619 and Saunier Duval
# 732) — same field positions as the Baxi row, but different manufacturers,
# output power and final year. They are paired with their expected fields in
# the parametrized test that follows.
_POTTERTON_619_RAW: str = (
    "000619,000034,0,2010/Sep/13 17:03,"
    "Potterton Myson,Potterton Myson,Flamingo 2,cf20/30,4160516,,1986,"
    "1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,"
    ",40.8,,3,,,0,2,0,,,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,"
)
# Raw Table 105 row for pcdb_id 732 (Saunier Duval 500 30c).
_SAUNIER_732_RAW: str = (
    "000732,000035,0,2010/Sep/13 17:03,"
    "Saunier Duval,Saunier Duval,500,30c,4192007,,1992,"
    "1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,,40.8,,3,,,0,2,0,"
    ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,"
)
@pytest.mark.parametrize(
    "raw_row, expected",
    [
        (
            _POTTERTON_619_RAW,
            dict(
                pcdb_id=619,
                brand_name="Potterton Myson",
                model_name="Flamingo 2",
                model_qualifier="cf20/30",
                output_kw_max=8.8,
                final_year_of_manufacture=1986,
            ),
        ),
        (
            _SAUNIER_732_RAW,
            dict(
                pcdb_id=732,
                brand_name="Saunier Duval",
                model_name="500",
                model_qualifier="30c",
                output_kw_max=8.8,
                final_year_of_manufacture=1992,
            ),
        ),
    ],
)
def test_table_105_parser_extracts_other_user_verified_records(
    raw_row: str, expected: dict[str, object]
) -> None:
    """Check that the field positions hold for other manufacturers.

    Each case pairs a raw Table 105 row with the identity, output-power and
    final-year values expected from it. Like the Baxi 000098 row, these rows
    carry the 66/56/40.8 SAP-default efficiencies — they are the "estimated
    (ie SAP default)" PCDB rows used to verify the parser's shape against
    ncm-pcdb.org.uk.
    """
    # Arrange — the raw row and its expectations arrive via parametrization.

    # Act
    parsed = parse_table_105_row(raw_row)

    # Assert — one check per expected attribute, labelled with its name.
    for field_name in expected:
        assert getattr(parsed, field_name) == expected[field_name], f"field {field_name}"
def test_table_105_parser_extracts_separate_dhw_tests_profile_flag() -> None:
    """Pin the "Separate DHW tests" flag decoded from the Vaillant 16839 row.

    BRE PCDF Spec v1.0 §7.11 field 48 (0-indexed 47) is the profile flag
    that selects the PCDB Table 3b/3c combi-loss path:

    * 0 — none / not applicable
    * 1 — one test, profile M (Table 3b)
    * 2 — two tests, profiles M+L (Table 3c)
    * 3 — two tests, profiles M+S (Table 3c)

    16839 lodges flag 1, i.e. the Table 3b path.
    """
    # Arrange / Act
    vaillant = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert
    one_test_profile_m = 1
    assert vaillant.separate_dhw_tests == one_test_profile_m
def test_table_105_parser_extracts_table_3b_3c_combi_loss_coefficients() -> None:
    """Pin the combi-loss coefficients decoded from the Vaillant 16839 row.

    BRE PCDF Spec v1.0 §7.11 fields 51 / 52 / 56 / 57 (0-indexed
    50 / 51 / 55 / 56) hold the Table 3b/3c coefficients: rejected energy
    r1, loss factor F1 (Table 3b), loss factor F2 (Table 3c) and rejected
    factor F3 (Table 3c, can be negative). 16839 lodges profile M only, so
    F2 and F3 are blank.

    Cross-verified by arithmetic: Σ(61) = (45) × r1 × fu + F1 × 365
    = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27 kWh/yr, against the
    000474 worksheet's PDF pin Σ(61) = 337.19 (Δ 0.02%).
    """
    # Arrange / Act
    vaillant = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert — the Table 3b pair is populated...
    table_3b_pair = (
        vaillant.rejected_energy_proportion_r1,
        vaillant.loss_factor_f1_kwh_per_day,
    )
    assert table_3b_pair == (0.0025, 0.91251)

    # ...while the Table 3c pair stays unset.
    for table_3c_value in (
        vaillant.loss_factor_f2_kwh_per_day,
        vaillant.rejected_factor_f3_per_litre,
    ):
        assert table_3c_value is None
def test_table_105_parser_leaves_combi_loss_fields_none_for_sap_default_boilers() -> None:
    """Check that a SAP-default boiler exposes no combi-loss coefficients.

    Baxi 000098 is a SAP-default boiler (no EN 13203-2 / OPS 26 tests), so
    its Table 3b/3c combi-loss fields are blank in pcdb10.dat. The parser
    surfaces them as None to signal the Table 3a fallback (the pre-§4-HW
    default 600 kWh/yr behaviour).
    """
    # Arrange / Act
    baxi = parse_table_105_row(_BAXI_98_RAW)

    # Assert — flag is 0 and every coefficient is absent.
    assert baxi.separate_dhw_tests == 0
    combi_loss_values = (
        baxi.rejected_energy_proportion_r1,
        baxi.loss_factor_f1_kwh_per_day,
        baxi.loss_factor_f2_kwh_per_day,
        baxi.rejected_factor_f3_per_litre,
    )
    assert all(value is None for value in combi_loss_values)
def test_parse_table_105_walks_section_skipping_headers_and_comments() -> None:
    """Check that the Table 105 walker only yields rows from its own section.

    In the .dat file each table opens with a `$<id>,<format>,...` header
    line, may contain `#`-prefixed comment lines, and closes with a
    `# ... end of Table <id>` marker before the next section begins. Only
    data rows inside the Table 105 section should be parsed; comments,
    headers and rows belonging to other tables are skipped.
    """
    # Arrange — a miniature .dat: noise, the 105 section holding two real
    # rows, then the start of a different table whose row must be ignored.
    section_lines = [
        "# noise before",
        "$105,211,2,2025,11,28,2",
        "# Table 105 (Gas and Oil Boilers) follows ...",
        "#",
        _BAXI_98_RAW,
        _POTTERTON_619_RAW,
        "#",
        "# ... end of Table 105 Format 211",
        "#",
        "$362,360,1,2025,11,28,1",
        "ignored,record,from,heat,pump,table",
    ]
    dat_section = "\n".join(section_lines) + "\n"

    # Act
    records = parse_table_105(dat_section)

    # Assert — exactly the two Table 105 rows, in file order.
    parsed_ids = [record.pcdb_id for record in records]
    assert parsed_ids == [98, 619]
    parsed_brands = [record.brand_name for record in records]
    assert parsed_brands == ["Baxi Heating", "Potterton Myson"]
def test_parse_table_105_extracts_user_verified_records_from_real_pcdb_dat() -> None:
    """Run the Table 105 parser end-to-end over the real BRE pcdb10.dat.

    The real file is 7.9 MB, ~23k lines, with CRLF endings. The test looks
    up three ground-truth records (pcdb_id 98, 619 and 732) that the user
    verified against ncm-pcdb.org.uk, so any drift between the parser's
    field positions and real-world data shows up here.
    """
    # Arrange — the file is read as latin-1 because manufacturer addresses
    # occasionally carry non-ASCII characters such as the degree sign.
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    by_id = {record.pcdb_id: record for record in parse_table_105(dat_text)}

    # Assert — full check on the Baxi row...
    baxi = by_id[98]
    assert baxi.brand_name == "Baxi Heating"
    assert baxi.model_name == "Wm"
    assert baxi.model_qualifier == "20/3rs"
    assert baxi.winter_efficiency_pct == 66.0
    assert baxi.summer_efficiency_pct == 56.0
    assert baxi.comparative_hot_water_efficiency_pct == 40.8
    assert baxi.final_year_of_manufacture == 1990

    # ...and brand + winter efficiency on the other two.
    potterton = by_id[619]
    assert potterton.brand_name == "Potterton Myson"
    assert potterton.winter_efficiency_pct == 66.0

    saunier = by_id[732]
    assert saunier.brand_name == "Saunier Duval"
    assert saunier.winter_efficiency_pct == 66.0
def test_run_etl_writes_table_105_jsonl_with_decoded_and_raw_fields(tmp_path: Path) -> None:
    """Run the ETL over the real pcdb10.dat and inspect the Table 105 output.

    The ETL is expected to write a newline-delimited JSON file (`.jsonl`),
    one record per line. The test reads it back line by line and checks
    both the decoded fields and that the raw row is kept alongside them.
    """
    # Arrange
    import json

    etl_output_dir = tmp_path / "pcdb_json"

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=etl_output_dir)

    # Assert — the file exists...
    jsonl_path = etl_output_dir / "pcdb_table_105_gas_oil_boilers.jsonl"
    assert jsonl_path.exists()

    # ...and, indexed by pcdb_id (blank lines skipped), holds the Baxi row.
    by_id = {}
    for line in jsonl_path.read_text().splitlines():
        if not line:
            continue
        row = json.loads(line)
        by_id[row["pcdb_id"]] = row

    baxi = by_id[98]
    assert baxi["brand_name"] == "Baxi Heating"
    assert baxi["winter_efficiency_pct"] == 66.0
    assert baxi["summer_efficiency_pct"] == 56.0
    assert baxi["raw"][0] == "000098"  # raw[0] = pcdb_id (left-padded)
def test_parse_table_raw_extracts_heat_pump_records_from_real_pcdb_dat() -> None:
    """Run the generic positional walker over Table 362 (Heat Pumps).

    Per-field typing for heat pumps is deferred to a future slice, once
    heat-pump records have been ground-truth verified; for now the parser
    only commits to pcdb_id plus the raw row. The test confirms the walker
    copes with a table other than 105 and returns non-empty output of the
    expected shape.
    """
    # Arrange
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    heat_pumps = parse_table_raw(dat_text, table_id="362")

    # Assert
    assert len(heat_pumps) > 0
    leading = heat_pumps[0]
    assert isinstance(leading.pcdb_id, int)
    assert leading.pcdb_id > 0
    # The first raw field is the zero-padded id (or the all-zero placeholder).
    id_field = leading.raw[0]
    assert id_field.lstrip("0") == str(leading.pcdb_id) or id_field == "000000"
    assert len(leading.raw) > 1  # multi-field row
def test_run_etl_writes_all_pcdb_table_jsonl_files(tmp_path: Path) -> None:
    """Check that the ETL emits a JSONL file for every PCDB table of interest.

    Per the user-chosen scope-D ingestion: 105 is typed; 322 is typed via
    `parse_table_322`; 122/143/313/353/362/391/506 are untyped (pcdb_id +
    raw). A file for table 329 is expected as well. Per-table typed
    refinement is left to follow-up slices, when the cert-side wiring for
    each table lands.
    """
    # Arrange
    etl_output_dir = tmp_path / "pcdb_json"
    expected_filenames = frozenset(
        (
            "pcdb_table_105_gas_oil_boilers.jsonl",
            "pcdb_table_122_solid_fuel_boilers.jsonl",
            "pcdb_table_143_micro_cogen.jsonl",
            "pcdb_table_313_flue_gas_heat_recovery.jsonl",
            "pcdb_table_322_decentralised_mev.jsonl",
            "pcdb_table_329_mv_in_use_factors.jsonl",
            "pcdb_table_353_waste_water_heat_recovery.jsonl",
            "pcdb_table_362_heat_pumps.jsonl",
            "pcdb_table_391_high_heat_retention_storage_heaters.jsonl",
            "pcdb_table_506_heat_interface_units.jsonl",
        )
    )

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=etl_output_dir)

    # Assert — extra files are tolerated; every expected one must be there.
    written = {entry.name for entry in etl_output_dir.iterdir()}
    assert expected_filenames <= written