# Model/tests/domain/sap10_calculator/test_pcdb_etl.py

"""Tests for the BRE PCDB (pcdb10.dat) ETL parser.

The PCDB is a multi-table comma-separated data file published by BRE.
Each table has its own format (`$<table_id>,<format>,...`) and its own
field schema. This module verifies that the per-table parsers decode
records into typed dicts matching ground-truth records the user
verified against https://www.ncm-pcdb.org.uk.

Reference: BRE Product Characteristics Database — pcdb10.dat (April 2026).
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from domain.sap10_calculator.tables.pcdb.etl import run_etl
from domain.sap10_calculator.tables.pcdb.parser import (
    parse_table_105,
    parse_table_105_row,
    parse_table_raw,
)


# Location of the real BRE data file used by the end-to-end tests. The fourth
# ancestor directory of this module (parents[3]) is taken as the base, and
# the package-relative path to pcdb10.dat is appended to it.
_PCDB_DAT_PATH: Path = Path(__file__).resolve().parents[3].joinpath(
    "domain", "sap10_calculator", "tables", "pcdb", "data", "pcdb10.dat"
)


# Baxi Heating "Wm 20/3rs" (pcdb_id 98): raw Table 105 row, checked by the
# user against the product's ncm-pcdb.org.uk entry.
_BAXI_98_RAW: str = "".join(
    (
        "000098,000005,0,2010/Sep/13 17:03,Baxi Heating,Baxi Heating,Wm,20/3rs,",
        "4107739,,1990,1,0,0,1,0,,,1,2,1,5.86,5.86,,,66.0,56.0,,40.8,,3,,,0,2,0,",
        ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,",
    )
)

# Vaillant ecoTEC pro 28 VUW GB 286/5-3 (pcdb_id 16839), the boiler behind the
# 000474 Elmhurst fixture. Checked by arithmetic against that worksheet's PDF
# value Σ(61) = 337.19, using Table 3b row 1:
#   Σ(61) = (45) × r1 × fu + F1 × 365
#         = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27
# Combi-loss fields in this row (BRE PCDF Spec v1.0 §7.11 fields
# 48/51/52/56/57):
#   separate_dhw_tests            = 1 (one test, profile M → Table 3b)
#   rejected_energy_proportion_r1 = 0.0025
#   loss_factor_f1_kwh_per_day    = 0.91251
#   loss_factor_f2 / rejected_factor_f3 = blank (Table 3c not used)
_VAILLANT_16839_RAW: str = "".join(
    (
        "016839,000031,0,2019/Mar/04 10:28,Vaillant,Vaillant,ecoTEC pro 28,",
        "VUW GB 286/5-3,GC 47-044-45,2005,2015,1,2,1,2,0,,,2,2,2,24.4,24.4,,,",
        "88.7,87.0,,75.1,,2,,,104,1,2,105,2,0,,,,0,,,,,1,7.012,0.133,0.0025,",
        "0.91251,,,,,,1,1,,0045,,,,,,,,,89.0,98.0,,,,,96.3",
    )
)


def test_table_105_parser_extracts_baxi_98_known_fields() -> None:
    """Parse the Baxi 000098 "Wm 20/3rs" row and check its typed fields.

    The expected values are the ones cross-checked against the
    ncm-pcdb.org.uk web entry: brand "Baxi Heating", model "Wm", qualifier
    "20/3rs", SAP winter / summer seasonal efficiency 66.0% / 56.0%,
    comparative hot water efficiency 40.8%, output 5.86 kW and final year
    of manufacture 1990."""
    # Arrange
    expected_fields = {
        "pcdb_id": 98,
        "brand_name": "Baxi Heating",
        "model_name": "Wm",
        "model_qualifier": "20/3rs",
        "winter_efficiency_pct": 66.0,
        "summer_efficiency_pct": 56.0,
        "comparative_hot_water_efficiency_pct": 40.8,
        "output_kw_max": 5.86,
        "final_year_of_manufacture": 1990,
    }

    # Act
    parsed = parse_table_105_row(_BAXI_98_RAW)

    # Assert — checked in declaration order; the first mismatch fails the test.
    for field_name, expected_value in expected_fields.items():
        assert getattr(parsed, field_name) == expected_value, f"field {field_name}"


# Two further user-verified records, used as (raw_row, expected fields) pairs
# by the parametrized test below — same field positions, different
# manufacturers + output power + final year.
# Potterton Myson "Flamingo 2 cf20/30" (pcdb_id 619): raw Table 105 row.
_POTTERTON_619_RAW: str = "".join(
    (
        "000619,000034,0,2010/Sep/13 17:03,Potterton Myson,Potterton Myson,",
        "Flamingo 2,cf20/30,4160516,,1986,1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,",
        ",40.8,,3,,,0,2,0,,,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,",
    )
)
# Saunier Duval "500 30c" (pcdb_id 732): raw Table 105 row.
_SAUNIER_732_RAW: str = "".join(
    (
        "000732,000035,0,2010/Sep/13 17:03,Saunier Duval,Saunier Duval,500,30c,",
        "4192007,,1992,1,0,0,1,0,,,1,1,1,8.8,8.8,,,66.0,56.0,,40.8,,3,,,0,2,0,",
        ",,0,,0,,0,,,,,0,,,,,,,,,,,,,0000,,,,,,,,,,,,,,,",
    )
)


@pytest.mark.parametrize(
    "raw_row, expected",
    [
        # Explicit ids: without them pytest derives each test id from the
        # string parameter itself, i.e. the entire raw row.
        pytest.param(
            _POTTERTON_619_RAW,
            {
                "pcdb_id": 619,
                "brand_name": "Potterton Myson",
                "model_name": "Flamingo 2",
                "model_qualifier": "cf20/30",
                "output_kw_max": 8.8,
                "final_year_of_manufacture": 1986,
            },
            id="potterton-myson-619",
        ),
        pytest.param(
            _SAUNIER_732_RAW,
            {
                "pcdb_id": 732,
                "brand_name": "Saunier Duval",
                "model_name": "500",
                "model_qualifier": "30c",
                "output_kw_max": 8.8,
                "final_year_of_manufacture": 1992,
            },
            id="saunier-duval-732",
        ),
    ],
)
def test_table_105_parser_extracts_other_user_verified_records(
    raw_row: str, expected: dict[str, object]
) -> None:
    """Confirms field positions hold across distinct manufacturers + final
    years. Both rows here, like Baxi 000098, ship with the same 66/56/40.8
    SAP-default efficiency — they're the same "estimated (ie SAP default)"
    PCDB rows used to verify the parser's shape against ncm-pcdb.org.uk.

    `raw_row` is one raw Table 105 line; `expected` maps record attribute
    names to the values that attribute must hold after parsing."""
    # Arrange — raw_row and expected are supplied by the parametrization.

    # Act
    record = parse_table_105_row(raw_row)

    # Assert — the message names the offending attribute on failure.
    for key, value in expected.items():
        assert getattr(record, key) == value, f"field {key}"


def test_table_105_parser_extracts_separate_dhw_tests_profile_flag() -> None:
    """Check the "Separate DHW tests" flag on the Vaillant 16839 row.

    BRE PCDF Spec v1.0 §7.11 field 48 (0-indexed 47) is the profile flag
    that selects the PCDB Table 3b/3c combi-loss path:
    0 = none / not applicable, 1 = one test, profile M (Table 3b),
    2 = two tests, profiles M+L (Table 3c), 3 = two tests, profiles M+S
    (Table 3c). Record 16839 lodges flag 1, i.e. the Table 3b path."""
    # Arrange / Act
    parsed = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert
    assert parsed.separate_dhw_tests == 1


def test_table_105_parser_extracts_table_3b_3c_combi_loss_coefficients() -> None:
    """Check the Table 3b/3c combi-loss coefficients on the Vaillant row.

    BRE PCDF Spec v1.0 §7.11 fields 51 / 52 / 56 / 57 (0-indexed
    50 / 51 / 55 / 56) hold rejected energy r1, loss factor F1 (Table 3b),
    loss factor F2 (Table 3c) and rejected factor F3 (Table 3c, can be
    negative). Record 16839 lodges profile M only, so F2 and F3 are blank.
    Cross-verified by arithmetic: Σ(61) = (45) × r1 × fu + F1 × 365
    = 1680.84 × 0.0025 × 1.0 + 0.91251 × 365 = 337.27 kWh/yr, against the
    000474 worksheet's PDF pin Σ(61) = 337.19 (Δ 0.02%)."""
    # Arrange / Act
    parsed = parse_table_105_row(_VAILLANT_16839_RAW)

    # Assert — lodged Table 3b coefficients first, then the blank 3c ones.
    assert parsed.rejected_energy_proportion_r1 == 0.0025
    assert parsed.loss_factor_f1_kwh_per_day == 0.91251
    for blank_field in ("loss_factor_f2_kwh_per_day", "rejected_factor_f3_per_litre"):
        assert getattr(parsed, blank_field) is None, f"field {blank_field}"


def test_table_105_parser_leaves_combi_loss_fields_none_for_sap_default_boilers() -> None:
    """Check that a SAP-default boiler exposes no combi-loss coefficients.

    Baxi 000098 has no EN 13203-2 / OPS 26 tests, so its Table 3b/3c
    combi-loss fields are blank in pcdb10.dat. The parser is expected to
    surface them as None, signalling the Table 3a fallback (the
    pre-§4-HW default 600 kWh/yr behaviour)."""
    # Arrange
    blank_fields = (
        "rejected_energy_proportion_r1",
        "loss_factor_f1_kwh_per_day",
        "loss_factor_f2_kwh_per_day",
        "rejected_factor_f3_per_litre",
    )

    # Act
    parsed = parse_table_105_row(_BAXI_98_RAW)

    # Assert
    assert parsed.separate_dhw_tests == 0
    for field_name in blank_fields:
        assert getattr(parsed, field_name) is None, f"field {field_name}"


def test_parse_table_105_walks_section_skipping_headers_and_comments() -> None:
    """Check that the Table 105 walker only yields rows from its own section.

    In the .dat file each table starts with a `$<id>,<format>,...` header
    line, may contain `#`-prefixed comments, and closes with a
    `# ... end of Table <id>` marker before the next section. Only the two
    rows inside the Table 105 section should come back; comments, headers
    and rows belonging to other tables must be ignored."""
    # Arrange — a miniature .dat: Table 105 with two rows, then Table 362.
    dat_lines = [
        "# noise before",
        "$105,211,2,2025,11,28,2",
        "# Table 105 (Gas and Oil Boilers) follows ...",
        "#",
        _BAXI_98_RAW,
        _POTTERTON_619_RAW,
        "#",
        "# ... end of Table 105 Format 211",
        "#",
        "$362,360,1,2025,11,28,1",
        "ignored,record,from,heat,pump,table",
    ]
    dat_section = "".join(f"{line}\n" for line in dat_lines)

    # Act
    records = parse_table_105(dat_section)

    # Assert
    parsed_ids_and_brands = [(rec.pcdb_id, rec.brand_name) for rec in records]
    assert parsed_ids_and_brands == [
        (98, "Baxi Heating"),
        (619, "Potterton Myson"),
    ]


def test_parse_table_105_extracts_user_verified_records_from_real_pcdb_dat() -> None:
    """End-to-end check against the real BRE pcdb10.dat (7.9 MB, ~23k lines,
    CRLF endings).

    Looks up the user-verified records 98, 619 and 732 by pcdb_id and
    compares them with the values confirmed on ncm-pcdb.org.uk, so any
    drift between the parser's field positions and the real-world data
    surfaces here."""
    # Arrange — the file is decoded as latin-1; manufacturer addresses
    # occasionally carry non-ASCII characters such as the degree sign.
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    records_by_id = {}
    for rec in parse_table_105(dat_text):
        records_by_id[rec.pcdb_id] = rec
    baxi = records_by_id[98]
    potterton = records_by_id[619]
    saunier = records_by_id[732]

    # Assert
    assert baxi.brand_name == "Baxi Heating"
    assert baxi.model_name == "Wm"
    assert baxi.model_qualifier == "20/3rs"
    assert baxi.winter_efficiency_pct == 66.0
    assert baxi.summer_efficiency_pct == 56.0
    assert baxi.comparative_hot_water_efficiency_pct == 40.8
    assert baxi.final_year_of_manufacture == 1990
    assert potterton.brand_name == "Potterton Myson"
    assert potterton.winter_efficiency_pct == 66.0
    assert saunier.brand_name == "Saunier Duval"
    assert saunier.winter_efficiency_pct == 66.0


def test_run_etl_writes_table_105_jsonl_with_decoded_and_raw_fields(tmp_path: Path) -> None:
    """End-to-end ETL: read the real pcdb10.dat, parse Table 105, write a
    newline-delimited JSON file (`.jsonl`). Each line is one record; the
    reader parses line-by-line. Verifies the decoded fields and that the
    raw row is preserved alongside.

    `tmp_path` is pytest's per-test temporary directory; the ETL output
    directory is created beneath it."""
    # Arrange
    output_dir = tmp_path / "pcdb_json"

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=output_dir)

    # Assert
    table_105_jsonl = output_dir / "pcdb_table_105_gas_oil_boilers.jsonl"
    assert table_105_jsonl.exists()
    # Iterate the file handle instead of calling str.splitlines(): the latter
    # also breaks on characters such as U+0085 and U+2028, which may appear
    # unescaped inside a JSON string and would cut one record into two.
    # The encoding is stated explicitly so the result does not depend on the
    # platform's locale default.
    # NOTE(review): assumes run_etl writes UTF-8 (or pure-ASCII) JSON —
    # confirm against the writer.
    with table_105_jsonl.open(encoding="utf-8") as jsonl_file:
        records = [json.loads(line) for line in jsonl_file if line.strip()]
    by_id = {r["pcdb_id"]: r for r in records}
    assert by_id[98]["brand_name"] == "Baxi Heating"
    assert by_id[98]["winter_efficiency_pct"] == 66.0
    assert by_id[98]["summer_efficiency_pct"] == 56.0
    assert by_id[98]["raw"][0] == "000098"  # raw[0] = pcdb_id (left-padded)


def test_parse_table_raw_extracts_heat_pump_records_from_real_pcdb_dat() -> None:
    """Generic positional walker against Table 362 (Heat Pumps). Per-field
    typing is deferred to a future slice once heat-pump records are ground-
    truth verified; for now the parser only commits to pcdb_id + raw row.
    Asserts the walker handles a table other than 105 and produces non-
    empty output with the expected shape."""
    # Arrange
    dat_text = _PCDB_DAT_PATH.read_text(encoding="latin-1")

    # Act
    records = parse_table_raw(dat_text, table_id="362")

    # Assert
    assert len(records) > 0
    first = records[0]
    assert isinstance(first.pcdb_id, int)
    assert first.pcdb_id > 0
    # raw[0] is the zero-padded id. Since pcdb_id > 0 is asserted above, an
    # all-zero raw[0] can never be a legitimate match, so no "000000"
    # escape hatch is allowed here — it would only mask a mismatch.
    assert first.raw[0].lstrip("0") == str(first.pcdb_id)
    assert len(first.raw) > 1  # multi-field row


def test_run_etl_writes_all_pcdb_table_jsonl_files(tmp_path: Path) -> None:
    """Check that the ETL emits one JSONL file per PCDB table of interest.

    Per the user-chosen scope-D ingestion: 105 is typed; 322 is typed via
    `parse_table_322`; 122/143/313/353/362/391/506 are untyped pcdb_id +
    raw. Table 329 is required here as well (NOTE(review): whether 329 is
    typed or raw is not visible from this test). Per-table typed refinement
    is left to follow-up slices when their cert-side wiring lands."""
    # Arrange
    output_dir = tmp_path / "pcdb_json"
    required_files = frozenset(
        (
            "pcdb_table_105_gas_oil_boilers.jsonl",
            "pcdb_table_122_solid_fuel_boilers.jsonl",
            "pcdb_table_143_micro_cogen.jsonl",
            "pcdb_table_313_flue_gas_heat_recovery.jsonl",
            "pcdb_table_322_decentralised_mev.jsonl",
            "pcdb_table_329_mv_in_use_factors.jsonl",
            "pcdb_table_353_waste_water_heat_recovery.jsonl",
            "pcdb_table_362_heat_pumps.jsonl",
            "pcdb_table_391_high_heat_retention_storage_heaters.jsonl",
            "pcdb_table_506_heat_interface_units.jsonl",
        )
    )

    # Act
    run_etl(source=_PCDB_DAT_PATH, output_dir=output_dir)

    # Assert — extra files in the output directory are tolerated.
    produced_files = {entry.name for entry in output_dir.iterdir()}
    assert required_files <= produced_files