# Model/domain/sap10_calculator/tables/pcdb/parser.py

"""Per-table row parsers for BRE PCDB pcdb10.dat records.

Each PCDB table has its own CSV-shaped record format documented by BRE
(format codes in `$<table>,<format>,...` headers of pcdb10.dat). Field
positions are reverse-engineered from sample records and cross-checked
against ground-truth records published at https://www.ncm-pcdb.org.uk.

The parsers expose two layers per record:
- Typed high-confidence fields (pcdb_id, manufacturer, model, winter/
  summer efficiency, etc.) named per BRE's web entry vocabulary.
- The full raw row as a tuple of strings, for forensics on undecoded
  fields and audit trails when BRE bumps the format version.

Reference: BRE PCDB pcdb10.dat April 2026; user-verified web records.
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Final, Optional


def _parse_optional_float(value: str) -> Optional[float]:
    """Parse one PCDB field into a finite float, or None.

    Empty PCDB fields are blank strings, not 'null'. Blank, non-numeric
    (e.g. '>70kW' range indicator on output-power fields) and non-finite
    text ('nan', 'inf', 'infinity') are all treated as None — the raw
    value is preserved on the record's `raw` tuple.

    Surrounding whitespace is ignored before parsing.
    """
    text = value.strip()
    if not text:
        return None
    try:
        parsed = float(text)
    except ValueError:
        return None
    # float() happily parses 'nan' / 'inf' / 'infinity' (any case, signed).
    # None of those is a measurement, and a NaN would silently poison any
    # arithmetic done on the typed field, so report them as missing.
    if not math.isfinite(parsed):
        return None
    return parsed


def _parse_optional_int(value: str) -> Optional[int]:
    """Parse one PCDB field into an int, or None.

    Some PCDB fields carry status strings ('obsolete', 'discontinued')
    where a year would otherwise live. Blank fields and any text that
    `int()` rejects are reported as missing rather than raising — the
    status stays readable on the record's `raw` tuple.
    """
    text = value.strip()
    try:
        # Blank is checked inside the try so both "missing" outcomes share
        # a single return type; int("") would raise ValueError anyway.
        return int(text) if text else None
    except ValueError:
        return None


@dataclass(frozen=True)
class GasOilBoilerRecord:
    """SAP 10.2 Appendix D2.1 PCDB record — Table 105 (Gas and Oil Boilers).

    Field positions verified against the ncm-pcdb.org.uk web entry for
    pcdb_id 000098 (Baxi Heating Wm 20/3rs): winter eff = 66.0%, summer
    eff = 56.0%, comparative HW = 40.8%, output 5.86 kW, final-year 1990.

    Instances are immutable. Every `Optional` attribute is None when the
    source field is blank or does not parse as a number; the untouched
    text of every field remains available through `raw`.
    """

    # Identity: row positions 1 / 6 / 7 / 8 (1-indexed).
    pcdb_id: int
    brand_name: str
    model_name: str
    model_qualifier: str
    # Efficiencies in percent: row positions 26 / 27 / 29 (1-indexed).
    winter_efficiency_pct: Optional[float]
    summer_efficiency_pct: Optional[float]
    comparative_hot_water_efficiency_pct: Optional[float]
    # Row position 23 (1-indexed). None when the field holds a range
    # indicator such as '>70kW' instead of a number.
    output_kw_max: Optional[float]
    # Row position 11 (1-indexed). None when the field holds a status
    # string ('obsolete', 'discontinued') instead of a year.
    final_year_of_manufacture: Optional[int]
    # PCDF Spec Rev 6b field 16 (0-idx 15): 0=normal, 1=integral FGHRS,
    # 2=combined HP+boiler, 3=combined HP+boiler+FGHRS. Gates the Table
    # 3b/3c row selection — only `subsidiary_type=0` exercises the
    # "Instantaneous with non-storage FGHRS or without FGHRS" row 1.
    subsidiary_type: Optional[int]
    # PCDF Spec Rev 6b field 39 (0-idx 38): 0=not storage combi, 1=primary
    # water store, 2=secondary store, 3=CPSU. Gates storage-combi rows in
    # Table 3b/3c (deferred until a fixture exercises).
    store_type: Optional[int]
    # SAP10.2 Appendix J Table 3b/3c — combi-loss fields per BRE PCDF Spec
    # Rev 6b (12 May 2021), Gas and Oil Boiler Table, fields 48 / 51 / 52
    # / 56 / 57 (see `domain/sap10_calculator/docs/specs/PCDF_Spec_Rev-06b_12_May_2021.pdf`
    # pp. 14-15). Populated only for boilers EN 13203-2 / OPS 26 tested;
    # SAP-default boilers leave them all blank → `separate_dhw_tests=0`
    # and (61)m falls back to Table 3a.
    #
    # Field 48 encodes the test schedules: 0=none, 1=schedule 2 only
    # (profile M → Table 3b row 1), 2=schedules 2 and 3 (profiles M+L →
    # Table 3c), 3=schedules 2 and 1 (profiles M+S → Table 3c).
    separate_dhw_tests: Optional[int]
    # Field 51 (r1). Field 55 (r2) is lodged but explicitly excluded from
    # SAP assessments ("only r1") so it is not surfaced.
    rejected_energy_proportion_r1: Optional[float]
    # Fields 52 (F1), 56 (F2) and 57 (F3) respectively.
    loss_factor_f1_kwh_per_day: Optional[float]
    loss_factor_f2_kwh_per_day: Optional[float]
    rejected_factor_f3_per_litre: Optional[float]
    # The complete comma-split row, verbatim, including undecoded fields.
    raw: tuple[str, ...]


# Line markers used when walking a PCDB .dat file.
# A line starting with this prefix opens a new table section
# (`$<id>,<format>,...`).
_TABLE_HEADER_PREFIX: str = "$"
# A line starting with this prefix is a comment and carries no record.
_COMMENT_PREFIX: str = "#"
# Table id, as written in the section header, of Gas and Oil Boilers.
_TABLE_105_HEADER_ID: str = "105"


def _walk_table_records(dat_text: str, table_id: str) -> list[str]:
    """Return the record rows found inside the named PCDB table section.

    The .dat file demarcates each table with a `$<id>,<format>,...` header
    on its own line. Records run from that header until the next `$<id>`
    header or end-of-input. `#`-prefixed lines are comments; blank lines
    are skipped too.

    Rows are returned in file order with their line terminator removed
    (`str.splitlines` already drops it) but otherwise unmodified — any
    leading or trailing spaces on a record line are kept. If the same
    table id opens more than one section, rows of every such section are
    collected.
    """
    collected: list[str] = []
    in_section = False
    for line in dat_text.splitlines():
        content = line.strip()
        if content == "" or content.startswith(_COMMENT_PREFIX):
            continue
        if not content.startswith(_TABLE_HEADER_PREFIX):
            # Ordinary record line: keep it only while the wanted table's
            # section is open.
            if in_section:
                collected.append(line)
            continue
        # Header line: the table id is everything between the prefix and
        # the first comma (or the whole remainder when there is no comma).
        header_id, _, _ = content[1:].partition(",")
        in_section = header_id == table_id
    return collected


@dataclass(frozen=True)
class RawPcdbRecord:
    """Untyped PCDB record — pcdb_id keyed lookup + raw row for future
    per-table typed refinement. Used for tables (122/143/362/391/313/353/
    506) where field positions have not yet been ground-truth verified.

    Instances are immutable. A Table 362 row held here can be decoded
    further by passing `raw` to `parse_heat_pump_row_raw`.
    """

    # Integer form of the first field of the row.
    pcdb_id: int
    # The complete comma-split row, verbatim (the first field included).
    raw: tuple[str, ...]


@dataclass(frozen=True)
class PsrEfficiencyGroup:
    """One PSR-dependent group from a Table 362 heat-pump record.
    Format 465 stores each group as 9 raw fields; the three populated
    positions are tabulated here for SAP 10.2 Appendix N interpolation:

      psr               plant size ratio (decimal, e.g. 0.2, 0.5, 1.0)
      eta_space_1_pct   space heating thermal efficiency (% gross)
                        — used by N3.6: (206) = 0.95 × eta_space_1
      eta_water_3_pct   calculated water heating thermal efficiency
                        (% gross) for HPs providing both space + water
                        — used by N3.7(a) + footnote 49: (217) =
                        in_use_factor × eta_water_3 (in_use_factor per
                        N3.7 table — 0.95 or 0.60 depending on whether
                        the cert's cylinder meets the PCDB-lodged
                        criteria of volume / HX area / heat loss).

    Instances are immutable. All three values are always present: the
    parser drops a group entirely when any of them is blank or
    non-numeric.
    """

    # Offset 0 within the 9-field group.
    psr: float
    # Offset 2 within the 9-field group.
    eta_space_1_pct: float
    # Offset 6 within the 9-field group.
    eta_water_3_pct: float


@dataclass(frozen=True)
class HeatPumpRecord:
    """SAP 10.2 Appendix N PCDB record — Table 362 (Heat Pumps).

    Format 465 of pcdb10.dat (April 2026 revision) extends the published
    PCDF Spec Rev 6b §A.23 format 464 with additional header fields and
    a larger PSR-group set (up to 14 groups). Field positions are
    reverse-engineered against the BRE web entry at
    https://www.ncm-pcdb.org.uk/sap/pcdbdetails.jsp?type=362&id=<pcdb_id>;
    Mitsubishi PUZ-WM50VHA (104568) and Daikin EDLQ05CAV3 (102421)
    provide the cohort ground-truth.

    Encoded fields per format 464 §A.23 docs (vocabulary preserved):
      fuel             39 = electricity (Note: SAP 10.2 spec line 5901
                       allows non-electric heat pumps too)
      service_provision 1 = space + water heating all year
                       2 = space + water during heating season only
                       3 = space heating only
                       4 = water heating only
      hw_vessel_mode    1 = integral vessel
                       2 = separate and specified vessel (fields 19-21)
                       3 = separate but unspecified vessel
                       4 = none (service provision code 3)
      vessel_volume_l, vessel_heat_loss_kwh_per_day,
      vessel_heat_exchanger_area_m2: per spec §A.23 field 19/20/21 —
      only populated when `hw_vessel_mode in {1, 2}`.

    `max_output_kw` (spec §A.23 field 30) is the PSR-denominator per
    PDF p.100 line 5946 ("maximum nominal output of the package").

    `heating_duration_code` (format-465 position 48) encodes the
    package's daily heating duration per SAP 10.2 Appendix N3.5 (PDF
    p.105 line 6099): "24", "16", "9", or "V" (Variable). Drives the
    extended-heating-schedule day allocation via Table N4/N5. Per
    footnote 48, modern records always lodge "V"; the fixed durations
    are retained for legacy purposes.

    `psr_groups` carries the PSR-dependent efficiency table (up to 14
    rows) used by SAP 10.2 Appendix N3.6 (space heating) and N3.7(a)
    (water heating), interpolated at the dwelling's PSR per spec PDF
    p.100 line 5957.

    Instances are immutable. `Optional` attributes are None when the
    source field is blank, non-numeric, or beyond the end of a short
    row; the untouched text stays available through `raw`.
    """

    # Integer form of raw[0].
    pcdb_id: int
    # Raw indices 6 / 7 / 8, stored as lodged (empty string on short rows).
    brand_name: str
    model_name: str
    model_qualifier: str
    # Raw index 16.
    fuel: Optional[int]
    # Raw indices 22 / 23.
    service_provision: Optional[int]
    hw_vessel_mode: Optional[int]
    # Raw indices 24 / 25 / 26.
    vessel_volume_l: Optional[float]
    vessel_heat_loss_kwh_per_day: Optional[float]
    vessel_heat_exchanger_area_m2: Optional[float]
    # Raw index 47.
    max_output_kw: Optional[float]
    # Raw index 48, whitespace-stripped; None when blank.
    heating_duration_code: Optional[str]
    # Decoded from the count at raw index 58 and the 9-field groups that
    # start at raw index 59; groups with a blank or non-numeric value are
    # omitted, so this may be shorter than the lodged count (or empty).
    psr_groups: tuple[PsrEfficiencyGroup, ...]
    # The complete comma-split row, verbatim, including undecoded fields.
    raw: tuple[str, ...]


# Format 465 field offsets in the raw row (0-indexed; raw[0] is the
# pcdb_id). Derived by cross-referencing pcdb10.dat record 104568
# (Mitsubishi Ecodan 5.0 kW) with the BRE web entry's labelled values.
_HP_IDX_BRAND_NAME: Final[int] = 6
_HP_IDX_MODEL_NAME: Final[int] = 7
_HP_IDX_MODEL_QUALIFIER: Final[int] = 8
_HP_IDX_FUEL: Final[int] = 16
_HP_IDX_SERVICE_PROVISION: Final[int] = 22
_HP_IDX_HW_VESSEL_MODE: Final[int] = 23
_HP_IDX_VESSEL_VOLUME_L: Final[int] = 24
_HP_IDX_VESSEL_HEAT_LOSS_KWH_PER_DAY: Final[int] = 25
_HP_IDX_VESSEL_HEAT_EXCHANGER_AREA_M2: Final[int] = 26
_HP_IDX_MAX_OUTPUT_KW: Final[int] = 47
# Format 465 position 48 — daily heating duration code per SAP 10.2
# Appendix N3.5 (PDF p.105 line 6099). Cohort ground-truth: "V" lodged
# on Mitsubishi PUZ-WM50VHA (104568) and Daikin EDLQ05CAV3 (102421).
_HP_IDX_HEATING_DURATION_CODE: Final[int] = 48

# Format 465 PSR-group block: raw index 58 is the group count; groups
# start at raw index 59 and are 9 fields wide, so group k (0-based)
# begins at 59 + 9*k. PSR / η_space,1 / η_water,3 sit at the offsets
# below, measured from the start of each group.
_HP_IDX_NUM_PSR_GROUPS: Final[int] = 58
_HP_PSR_GROUP_START: Final[int] = 59
_HP_PSR_GROUP_STRIDE: Final[int] = 9
_HP_PSR_GROUP_OFFSET_PSR: Final[int] = 0
_HP_PSR_GROUP_OFFSET_ETA_SPACE_1: Final[int] = 2
_HP_PSR_GROUP_OFFSET_ETA_WATER_3: Final[int] = 6


def _parse_psr_groups(raw: tuple[str, ...]) -> tuple[PsrEfficiencyGroup, ...]:
    """Extract the PSR-dependent efficiency groups of a format-465
    heat-pump row.

    The lodged group count is read from raw index 58; groups follow from
    raw index 59, 9 fields apiece, with PSR / η_space,1 / η_water,3 at
    offsets 0 / 2 / 6 inside each group.

    Returns an empty tuple when the row is too short to hold the count
    or the count is blank, non-numeric or not positive. Decoding stops at
    the first group that would run past the end of the row, and a group
    with any of its three values blank or non-numeric is left out, so
    the result can be shorter than the lodged count.
    """
    field_total = len(raw)
    if field_total <= _HP_IDX_NUM_PSR_GROUPS:
        return ()
    declared = _parse_optional_int(raw[_HP_IDX_NUM_PSR_GROUPS])
    if declared is None or declared < 1:
        return ()

    wanted_offsets = (
        _HP_PSR_GROUP_OFFSET_PSR,
        _HP_PSR_GROUP_OFFSET_ETA_SPACE_1,
        _HP_PSR_GROUP_OFFSET_ETA_WATER_3,
    )
    block_end = _HP_PSR_GROUP_START + declared * _HP_PSR_GROUP_STRIDE
    decoded: list[PsrEfficiencyGroup] = []
    for start in range(_HP_PSR_GROUP_START, block_end, _HP_PSR_GROUP_STRIDE):
        # The water-efficiency offset is the furthest field read, so it
        # decides whether this group still fits inside the row.
        if start + _HP_PSR_GROUP_OFFSET_ETA_WATER_3 >= field_total:
            break
        values = [
            _parse_optional_float(raw[start + offset])
            for offset in wanted_offsets
        ]
        if any(item is None for item in values):
            continue
        psr, eta_space_1, eta_water_3 = values
        decoded.append(
            PsrEfficiencyGroup(
                psr=psr,
                eta_space_1_pct=eta_space_1,
                eta_water_3_pct=eta_water_3,
            )
        )
    return tuple(decoded)


def interpolate_heat_pump_efficiency_at_psr(
    psr_groups: tuple[PsrEfficiencyGroup, ...],
    *,
    target_psr: float,
) -> tuple[float, float]:
    """SAP 10.2 PDF p.100 line 5957 — linear interpolation between the
    two PSR rows enclosing `target_psr`. Returns `(eta_space_1_pct,
    eta_water_3_pct)` at the dwelling's PSR.

    Per spec PDF p.101 lines 6007-6008: clamp to the smallest PSR
    in the record when `target_psr` is below it, and to the largest
    when above ("if the PSR is greater than the largest PSR in the
    database record then the heat pump space and water heating
    fractions for the largest PSR should be used, and if the PSR is
    less than the smallest PSR in the database record then the heat
    pump space and water heating fractions for the smallest PSR
    should be used").

    The groups may be supplied in any order: they are sorted by PSR
    before use, so "smallest" and "largest" always mean what the spec
    says even if a record lodges its rows out of sequence. Groups that
    share a PSR keep their supplied order; the first of the pair wins.

    Raises:
        ValueError: if `psr_groups` is empty or `target_psr` is NaN.
    """
    if not psr_groups:
        raise ValueError("PSR groups required for interpolation")
    if math.isnan(target_psr):
        # Every comparison with NaN is False, so without this guard the
        # call would drop through to the "unreachable" branch below.
        raise ValueError("target_psr must be a number, got NaN")

    # sorted() is stable, so already-ascending input is left as it was.
    ordered = sorted(psr_groups, key=lambda group: group.psr)
    smallest, largest = ordered[0], ordered[-1]
    if target_psr <= smallest.psr:
        return (smallest.eta_space_1_pct, smallest.eta_water_3_pct)
    if target_psr >= largest.psr:
        return (largest.eta_space_1_pct, largest.eta_water_3_pct)

    for low_group, high_group in zip(ordered, ordered[1:]):
        if low_group.psr <= target_psr <= high_group.psr:
            span = high_group.psr - low_group.psr
            # span is 0 only for duplicated PSR rows; use the lower row.
            t = (target_psr - low_group.psr) / span if span > 0 else 0.0
            eta_space_1 = (
                low_group.eta_space_1_pct
                + (high_group.eta_space_1_pct - low_group.eta_space_1_pct) * t
            )
            eta_water_3 = (
                low_group.eta_water_3_pct
                + (high_group.eta_water_3_pct - low_group.eta_water_3_pct) * t
            )
            return (eta_space_1, eta_water_3)
    # Unreachable for finite PSR values: target_psr lies strictly between
    # the smallest and largest row, so some adjacent pair brackets it.
    raise AssertionError("PSR bracket not found despite range check")


def parse_heat_pump_row_raw(raw: tuple[str, ...]) -> HeatPumpRecord:
    """Build a typed `HeatPumpRecord` from a Table 362 format-465 raw row.

    Missing trailing fields (older partially-populated records) are
    tolerated: any index past the end of the row reads as an empty
    string, which the numeric parsers turn into None and which leaves
    the name fields empty. Only `raw[0]` is mandatory — it must exist and
    be an integer, otherwise `IndexError` / `ValueError` propagates.
    """
    width = len(raw)

    def text(position: int) -> str:
        # Short rows read as blank instead of raising IndexError.
        if position >= width:
            return ""
        return raw[position]

    def whole(position: int) -> Optional[int]:
        return _parse_optional_int(text(position))

    def number(position: int) -> Optional[float]:
        return _parse_optional_float(text(position))

    # A blank duration code is reported as None, not as "".
    duration = text(_HP_IDX_HEATING_DURATION_CODE).strip() or None
    return HeatPumpRecord(
        pcdb_id=int(raw[0]),
        brand_name=text(_HP_IDX_BRAND_NAME),
        model_name=text(_HP_IDX_MODEL_NAME),
        model_qualifier=text(_HP_IDX_MODEL_QUALIFIER),
        fuel=whole(_HP_IDX_FUEL),
        service_provision=whole(_HP_IDX_SERVICE_PROVISION),
        hw_vessel_mode=whole(_HP_IDX_HW_VESSEL_MODE),
        vessel_volume_l=number(_HP_IDX_VESSEL_VOLUME_L),
        vessel_heat_loss_kwh_per_day=number(
            _HP_IDX_VESSEL_HEAT_LOSS_KWH_PER_DAY
        ),
        vessel_heat_exchanger_area_m2=number(
            _HP_IDX_VESSEL_HEAT_EXCHANGER_AREA_M2
        ),
        max_output_kw=number(_HP_IDX_MAX_OUTPUT_KW),
        heating_duration_code=duration,
        psr_groups=_parse_psr_groups(raw),
        raw=raw,
    )


def parse_table_raw(dat_text: str, table_id: str) -> list[RawPcdbRecord]:
    """Generic positional walker for any PCDB table.

    Each record row of the requested table is split on commas and wrapped
    as a `RawPcdbRecord` carrying the integer pcdb_id (first field) and
    the full field tuple — no per-field decoding. Typed parsers (e.g.
    Table 362 heat pumps) refine specific fields without changing this
    contract.

    A row whose first field is not an integer raises `ValueError`.
    """
    records: list[RawPcdbRecord] = []
    for row in _walk_table_records(dat_text, table_id):
        fields = tuple(row.split(","))
        records.append(RawPcdbRecord(pcdb_id=int(fields[0]), raw=fields))
    return records


def parse_table_105(dat_text: str) -> list[GasOilBoilerRecord]:
    """Parse every Table 105 (Gas and Oil Boilers) record in a PCDB dat
    string, in file order, decoding each row with `parse_table_105_row`."""
    boiler_rows = _walk_table_records(dat_text, _TABLE_105_HEADER_ID)
    return list(map(parse_table_105_row, boiler_rows))


def parse_table_105_row(row: str) -> GasOilBoilerRecord:
    """Decode one Table 105 (Gas and Oil Boilers) record row into a typed
    record. Field positions (1-indexed): 1 pcdb_id, 6 brand_name,
    7 model_name, 8 model_qualifier, 11 final_year, 16 subsidiary_type,
    23 output_kw_max, 26 winter_efficiency_pct, 27 summer_efficiency_pct,
    29 comparative hot water efficiency, 39 store_type,
    48 separate_dhw_tests, 51 r1, 52 F1, 56 F2, 57 F3. All fields are
    preserved verbatim in `raw`.

    Rows shorter than the full layout are tolerated, matching
    `parse_heat_pump_row_raw`: a position past the end of the row reads
    as blank, so the corresponding numeric attribute is None and a
    missing name is the empty string. Only the pcdb_id is mandatory.

    Raises:
        ValueError: if the first field is not an integer.
    """
    fields = tuple(row.rstrip("\r\n").split(","))

    def at(idx: int) -> str:
        # Short rows read as blank instead of raising IndexError.
        return fields[idx] if idx < len(fields) else ""

    return GasOilBoilerRecord(
        pcdb_id=int(fields[0]),
        brand_name=at(5),
        model_name=at(6),
        model_qualifier=at(7),
        final_year_of_manufacture=_parse_optional_int(at(10)),
        output_kw_max=_parse_optional_float(at(22)),
        winter_efficiency_pct=_parse_optional_float(at(25)),
        summer_efficiency_pct=_parse_optional_float(at(26)),
        comparative_hot_water_efficiency_pct=_parse_optional_float(at(28)),
        subsidiary_type=_parse_optional_int(at(15)),
        store_type=_parse_optional_int(at(38)),
        separate_dhw_tests=_parse_optional_int(at(47)),
        rejected_energy_proportion_r1=_parse_optional_float(at(50)),
        loss_factor_f1_kwh_per_day=_parse_optional_float(at(51)),
        loss_factor_f2_kwh_per_day=_parse_optional_float(at(55)),
        rejected_factor_f3_per_litre=_parse_optional_float(at(56)),
        raw=fields,
    )