"""Tests for the BRE PCDB (pcdb10.dat) ETL parser. The PCDB is a multi-table comma-separated data file published by BRE. Each table has its own format (`$,,...`) and its own field schema. This module verifies that the per-table parsers decode records into typed dicts matching ground-truth records the user verified against https://www.ncm-pcdb.org.uk. Reference: BRE Product Characteristics Database — pcdb10.dat (April 2026). """ from __future__ import annotations from pathlib import Path import pytest from domain.sap10_calculator.tables.pcdb.etl import run_etl from domain.sap10_calculator.tables.pcdb.parser import ( parse_table_105, parse_table_105_row, parse_table_raw, ) _PCDB_DAT_PATH: Path = ( Path(__file__).resolve().parents[3] / "domain" / "sap10_calculator" / "tables" / "pcdb" / "data" / "pcdb10.dat" ) # Verified by user against ncm-pcdb.org.uk: Baxi Heating Wm 20/3rs. _BAXI_98_RAW: str = ( "000098,000005,0,2010/Sep/13 17:03,Baxi Heating,Baxi Heating,Wm,20/3rs," "4107739,,1990,1,0,0,1,0,,,1,2,1,5.86,5.86,,,66.0,56.0,,40.8,,3,,,0,2,0," ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,," ) # Verified by ground-truth arithmetic against PDF Σ(61) = 337.19 for 000474 # Elmhurst fixture (Vaillant ecoTEC pro 28 VUW GB 286/5-3, pcdb_id 16839): # Table 3b row 1 → Σ(61) = (45) × r1 × fu + F1 × 365 # = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27. 
# Combi-loss fields (BRE PCDF Spec v1.0 §7.11 fields 48/51/52/56/57):
#   separate_dhw_tests            = 1 (one test, profile M → Table 3b)
#   rejected_energy_proportion_r1 = 0.0025
#   loss_factor_f1_kwh_per_day    = 0.91251
#   loss_factor_f2 / rejected_factor_f3 = blank (Table 3c not used)
_VAILLANT_16839_RAW: str = (
    "016839,000031,0,2019/Mar/04 10:28,Vaillant,Vaillant,ecoTEC pro 28,"
    "VUW GB 286/5-3,GC 47-044-45,2005,2015,1,2,1,2,0,,,2,2,2,24.4,24.4,,,"
    "88.7,87.0,,75.1,,2,,,104,1,2,105,2,0,,,,0,,,,,1,7.012,0.133,0.0025,"
    "0.91251,,,,,,1,1,,0045,,,,,,,,,89.0,98.0,,,,,96.3"
)


def test_table_105_parser_extracts_baxi_98_known_fields() -> None:
    """Decode the user-verified Baxi 000098 (Wm 20/3rs) row.

    The expected values were cross-checked against the ncm-pcdb.org.uk web
    entry for pcdb_id 98: brand "Baxi Heating", model "Wm", qualifier
    "20/3rs", SAP winter seasonal efficiency 66.0%, SAP summer seasonal
    efficiency 56.0%, comparative hot water efficiency 40.8%, output 5.86 kW
    and final year of manufacture 1990.
    """
    # Act — parse the single raw row directly.
    baxi = parse_table_105_row(_BAXI_98_RAW)

    # Assert — identity fields first, then efficiencies, output and year.
    assert baxi.pcdb_id == 98
    assert baxi.brand_name == "Baxi Heating"
    assert baxi.model_name == "Wm"
    assert baxi.model_qualifier == "20/3rs"
    assert baxi.winter_efficiency_pct == 66.0
    assert baxi.summer_efficiency_pct == 56.0
    assert baxi.comparative_hot_water_efficiency_pct == 40.8
    assert baxi.output_kw_max == 5.86
    assert baxi.final_year_of_manufacture == 1990


# (raw_row, expected fields). Further user-verified records — same field
# positions, different manufacturers + output power + final year.
_POTTERTON_619_RAW: str = (
    "000619,000034,0,2010/Sep/13 17:03,Potterton Myson,Potterton Myson,"
    "Flamingo 2,cf20/30,4160516,,1986,1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,"
    ",40.8,,3,,,0,2,0,,,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,"
)
_SAUNIER_732_RAW: str = (
    "000732,000035,0,2010/Sep/13 17:03,Saunier Duval,Saunier Duval,500,30c,"
    "4192007,,1992,1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,,40.8,,3,,,0,2,0,"
    ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,"
)


@pytest.mark.parametrize(
    "raw_row, expected",
    [
        (
            _POTTERTON_619_RAW,
            {
                "pcdb_id": 619,
                "brand_name": "Potterton Myson",
                "model_name": "Flamingo 2",
                "model_qualifier": "cf20/30",
                "output_kw_max": 8.8,
                "final_year_of_manufacture": 1986,
            },
        ),
        (
            _SAUNIER_732_RAW,
            {
                "pcdb_id": 732,
                "brand_name": "Saunier Duval",
                "model_name": "500",
                "model_qualifier": "30c",
                "output_kw_max": 8.8,
                "final_year_of_manufacture": 1992,
            },
        ),
    ],
)
def test_table_105_parser_extracts_other_user_verified_records(
    raw_row: str, expected: dict[str, object]
) -> None:
    """Check that field positions hold for other manufacturers.

    The rows differ in manufacturer, output power and final year. Together
    with the Baxi 000098 row they all carry the same 66/56/40.8 SAP-default
    efficiencies — they are the "estimated (ie SAP default)" PCDB rows used
    to verify the parser's shape against ncm-pcdb.org.uk.
    """
    # Act
    parsed = parse_table_105_row(raw_row)

    # Assert — each expected key names an attribute on the parsed record.
    for key, wanted in expected.items():
        assert getattr(parsed, key) == wanted, f"field {key}"


def test_table_105_parser_extracts_separate_dhw_tests_profile_flag() -> None:
    """Decode the "Separate DHW tests" profile flag.

    BRE PCDF Spec v1.0 §7.11 field 48 (0-indexed 47) encodes the profile
    flag used for PCDB Table 3b/3c combi-loss selection:

        0 = none / not applicable,
        1 = one test, profile M (Table 3b),
        2 = two tests, profiles M+L (Table 3c),
        3 = two tests, profiles M+S (Table 3c).

    16839 lodges flag=1 → Table 3b path.
    """
    # Act
    vaillant = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert
    assert vaillant.separate_dhw_tests == 1


def test_table_105_parser_extracts_table_3b_3c_combi_loss_coefficients() -> None:
    """Decode the Table 3b/3c combi-loss coefficients.

    BRE PCDF Spec v1.0 §7.11 fields 51 / 52 / 56 / 57 (0-indexed
    50 / 51 / 55 / 56) carry: rejected energy r1, loss factor F1 (Table 3b),
    loss factor F2 (Table 3c) and rejected factor F3 (Table 3c, can be
    negative).

    16839 lodges profile M only, so F2/F3 are absent (blank). Cross-verified
    by arithmetic: Σ(61) = (45) × r1 × fu + F1 × 365
    = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27 kWh/yr against the
    000474 worksheet's PDF pin Σ(61) = 337.19 (Δ 0.02%).
    """
    # Act
    vaillant = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert — Table 3b values are populated, Table 3c values are blank.
    assert vaillant.rejected_energy_proportion_r1 == 0.0025
    assert vaillant.loss_factor_f1_kwh_per_day == 0.91251
    assert vaillant.loss_factor_f2_kwh_per_day is None
    assert vaillant.rejected_factor_f3_per_litre is None


def test_table_105_parser_leaves_combi_loss_fields_none_for_sap_default_boilers() -> None:
    """Blank combi-loss fields must surface as None.

    Baxi 000098 is a SAP-default boiler (no EN 13203-2 / OPS 26 tests), so
    the Table 3b/3c combi-loss fields are blank in pcdb10.dat. The parser
    exposes them as None to signal Table 3a fallback (the pre-§4-HW default
    600 kWh/yr behaviour).
    """
    # Act
    baxi = parse_table_105_row(_BAXI_98_RAW)

    # Assert — the flag is an explicit 0; every coefficient is missing.
    assert baxi.separate_dhw_tests == 0
    assert baxi.rejected_energy_proportion_r1 is None
    assert baxi.loss_factor_f1_kwh_per_day is None
    assert baxi.loss_factor_f2_kwh_per_day is None
    assert baxi.rejected_factor_f3_per_litre is None


def test_parse_table_105_walks_section_skipping_headers_and_comments() -> None:
    """Walk a synthetic .dat section and keep only the Table 105 rows.

    The .dat file demarcates each table with a `$<table>,<format>,...`
    header line, intersperses `#`-prefixed comments, and ends the table with
    a `# ... end of Table <table> Format <format>` marker before the next
    section.

    The walker yields parsed records only for rows inside the Table 105
    section, ignoring comments, headers, and rows from other tables.
    """
    # Arrange — two real Table 105 rows wrapped in comments, plus a Table 362
    # section whose row must not be returned.
    dat_section = "".join(
        (
            "# noise before\n",
            "$105,211,2,2025,11,28,2\n",
            "# Table 105 (Gas and Oil Boilers) follows ...\n",
            "#\n",
            f"{_BAXI_98_RAW}\n",
            f"{_POTTERTON_619_RAW}\n",
            "#\n",
            "# ... end of Table 105 Format 211\n",
            "#\n",
            "$362,360,1,2025,11,28,1\n",
            "ignored,record,from,heat,pump,table\n",
        )
    )

    # Act
    parsed_rows = parse_table_105(dat_section)

    # Assert — exactly the two boiler rows, in file order.
    assert [row.pcdb_id for row in parsed_rows] == [98, 619]
    assert parsed_rows[0].brand_name == "Baxi Heating"
    assert parsed_rows[1].brand_name == "Potterton Myson"


def test_parse_table_105_extracts_user_verified_records_from_real_pcdb_dat() -> None:
    """Parse Table 105 out of the real BRE pcdb10.dat.

    End-to-end against the real file (7.9 MB, ~23k lines, CRLF endings).
    Cross-references the ground-truth records the user verified against
    ncm-pcdb.org.uk — surfaces any drift between the parser's field
    positions and real-world data.
    """
    # Arrange — the file is decoded as latin-1; manufacturer addresses
    # occasionally carry non-ASCII characters such as the degree sign.
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    by_id = {row.pcdb_id: row for row in parse_table_105(dat_text)}

    # Assert — full field check for Baxi, spot checks for the other two.
    baxi = by_id[98]
    assert baxi.brand_name == "Baxi Heating"
    assert baxi.model_name == "Wm"
    assert baxi.model_qualifier == "20/3rs"
    assert baxi.winter_efficiency_pct == 66.0
    assert baxi.summer_efficiency_pct == 56.0
    assert baxi.comparative_hot_water_efficiency_pct == 40.8
    assert baxi.final_year_of_manufacture == 1990

    potterton = by_id[619]
    assert potterton.brand_name == "Potterton Myson"
    assert potterton.winter_efficiency_pct == 66.0

    saunier = by_id[732]
    assert saunier.brand_name == "Saunier Duval"
    assert saunier.winter_efficiency_pct == 66.0


def test_run_etl_writes_table_105_jsonl_with_decoded_and_raw_fields(tmp_path: Path) -> None:
    """Run the ETL and inspect the Table 105 JSONL output.

    End-to-end ETL: read the real pcdb10.dat, parse Table 105, write a
    newline-delimited JSON file (`.jsonl`). Each line is one record and is
    parsed line-by-line here. Verifies the decoded fields and that the raw
    row is preserved alongside them.
    """
    # Arrange
    import json

    output_dir = tmp_path / "pcdb_json"

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=output_dir)

    # Assert
    table_105_jsonl = output_dir / "pcdb_table_105_gas_oil_boilers.jsonl"
    assert table_105_jsonl.exists()

    # NOTE(review): read_text() uses the platform default encoding — confirm
    # it matches whatever encoding run_etl writes with.
    by_id = {}
    for line in table_105_jsonl.read_text().splitlines():
        if not line:
            continue  # tolerate blank lines between records
        row = json.loads(line)
        by_id[row["pcdb_id"]] = row

    baxi = by_id[98]
    assert baxi["brand_name"] == "Baxi Heating"
    assert baxi["winter_efficiency_pct"] == 66.0
    assert baxi["summer_efficiency_pct"] == 56.0
    assert baxi["raw"][0] == "000098"  # raw[0] = pcdb_id (left-padded)


def test_parse_table_raw_extracts_heat_pump_records_from_real_pcdb_dat() -> None:
    """Exercise the generic positional walker on Table 362 (Heat Pumps).

    Per-field typing is deferred to a future slice once heat-pump records
    are ground-truth verified; for now the parser only commits to pcdb_id +
    raw row. Asserts the walker handles a table other than 105 and produces
    non-empty output with the expected shape.
    """
    # Arrange
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    heat_pumps = parse_table_raw(dat_text, table_id="362")

    # Assert
    assert len(heat_pumps) > 0
    first = heat_pumps[0]
    assert isinstance(first.pcdb_id, int)
    assert first.pcdb_id > 0
    # An all-zero id strips down to the empty string, hence the explicit
    # "000000" alternative next to the zero-stripped comparison.
    leading_field = first.raw[0]
    assert leading_field.lstrip("0") == str(first.pcdb_id) or leading_field == "000000"
    assert len(first.raw) > 1  # multi-field row


def test_run_etl_writes_all_pcdb_table_jsonl_files(tmp_path: Path) -> None:
    """Check that the ETL writes one JSONL file per PCDB table of interest.

    Per the user-chosen scope-D ingestion: ETL produces JSONL for every PCDB
    table of interest (105 typed; 322 typed via `parse_table_322`;
    122/143/313/353/362/391/506 as untyped pcdb_id + raw). The expected set
    below also includes the Table 329 file; this test only checks that each
    file is written, not how its rows are typed.

    Per-table typed refinement is the job of follow-up slices when their
    cert-side wiring lands.
    """
    # Arrange
    expected_filenames = {
        "pcdb_table_105_gas_oil_boilers.jsonl",
        "pcdb_table_122_solid_fuel_boilers.jsonl",
        "pcdb_table_143_micro_cogen.jsonl",
        "pcdb_table_313_flue_gas_heat_recovery.jsonl",
        "pcdb_table_322_decentralised_mev.jsonl",
        "pcdb_table_329_mv_in_use_factors.jsonl",
        "pcdb_table_353_waste_water_heat_recovery.jsonl",
        "pcdb_table_362_heat_pumps.jsonl",
        "pcdb_table_391_high_heat_retention_storage_heaters.jsonl",
        "pcdb_table_506_heat_interface_units.jsonl",
    }
    output_dir = tmp_path / "pcdb_json"

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=output_dir)

    # Assert — extra files are allowed; every expected one must be present.
    written = {entry.name for entry in output_dir.iterdir()}
    missing = expected_filenames - written
    assert not missing