Model/domain/sap10_calculator/tables/pcdb/etl.py
Khalim Conn-Kowlessar 34cbd7d66c feat(pcdb): parse Table 323 (Centralised MEV / MVHR) + Table 329 efficiency IUF
MVHR (24a) heat-recovery support, part 1: the PCDB data layer.

PCDB Table 323 (PCDF Spec Rev 6b §A.18, Format 426; pcdb10.dat carries
Format 431, header `$323,431,...`) holds the per-wet-room SFP + heat-
exchanger efficiency for centralised MEV / MVHR units. Added
`MvhrRecord` / `MvhrDataPoint`, `parse_centralised_mv_row` /
`parse_table_323`, the ETL step, the committed jsonl, and the
`mvhr_record(pcdb_id)` runtime lookup (mirrors Table 322).

SAP 10.2 §2.6.4/§2.6.6: "MVHR ... SFP is a single value depending on the
number of wet rooms" — each test group's leading field is the wet-room
count; callers select the group matching the dwelling lodgement.
Worksheet-proven on simulated case 49 (000565, 2 wet rooms, Vent Axia
Sentinel Kinetic B 500140 → flow 21.0, SFP 0.88, efficiency 91%).

Also decoded the MVHR heat-recovery efficiency in-use factor from Table
329 (Format 432): system_type 3 ducts-inside-envelope = 0.90 (case-49
(23c) = 91 × 0.90 = 81.9%), cross-checked against system_type 10 = 0.70
(= SAP 10.2 Table 4g default heat-recovery in-use factor). "Table 4h is
no longer used – data now stored in the PCDB" (SAP 10.2 p.176).

The outside-envelope efficiency columns + with-scheme SFP blocks are
preserved verbatim in `raw` (no fixture exercises them yet).

Note: pyright strict type gate not run locally (pyright not installed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 19:29:17 +00:00

148 lines
6 KiB
Python

"""ETL: parse BRE PCDB pcdb10.dat into per-table JSON files.
Idempotent. Re-run when BRE publishes an updated pcdb10.dat. JSON files
are committed in-repo alongside the source .dat so callers can load
without a build step. Run via `python -m domain.sap10_calculator.tables.pcdb.etl`.
Reference: BRE PCDB pcdb10.dat (April 2026 revision).
"""
from __future__ import annotations
import json
from dataclasses import asdict
from pathlib import Path
from domain.sap10_calculator.tables.pcdb.parser import (
DecentralisedMevRecord,
GasOilBoilerRecord,
MvhrRecord,
MvInUseFactorsRecord,
RawPcdbRecord,
parse_table_105,
parse_table_322,
parse_table_323,
parse_table_329,
parse_table_raw,
)
# Output filenames for the tables that `run_etl` parses with a dedicated
# typed parser (`parse_table_105` / `_322` / `_323` / `_329`). Each file is
# written directly under the `output_dir` passed to `run_etl`.
_TABLE_105_OUTPUT_FILENAME: str = "pcdb_table_105_gas_oil_boilers.jsonl"
_TABLE_322_OUTPUT_FILENAME: str = "pcdb_table_322_decentralised_mev.jsonl"
_TABLE_323_OUTPUT_FILENAME: str = "pcdb_table_323_centralised_mev_mvhr.jsonl"
_TABLE_329_OUTPUT_FILENAME: str = "pcdb_table_329_mv_in_use_factors.jsonl"
# Tables ingested as `RawPcdbRecord` (pcdb_id + raw) — per-field typing is
# deferred to follow-up slices when the cert-side wiring for each table
# lands.
# Maps PCDB table id -> output filename. `run_etl` iterates this mapping,
# passing each key to `parse_table_raw` and writing the result to the
# paired filename.
_RAW_TABLES: dict[str, str] = {
    "122": "pcdb_table_122_solid_fuel_boilers.jsonl",
    "143": "pcdb_table_143_micro_cogen.jsonl",
    "313": "pcdb_table_313_flue_gas_heat_recovery.jsonl",
    "353": "pcdb_table_353_waste_water_heat_recovery.jsonl",
    "362": "pcdb_table_362_heat_pumps.jsonl",
    "391": "pcdb_table_391_high_heat_retention_storage_heaters.jsonl",
    "506": "pcdb_table_506_heat_interface_units.jsonl",
}
def _gas_oil_record_to_jsonable(record: GasOilBoilerRecord) -> dict[str, object]:
    """Flatten a typed Table 105 record into a dict `json.dumps` accepts.

    Every dataclass field is emitted via `asdict`; the `raw` entry is
    replaced by a plain list built from `record.raw`. Overriding the key
    inside the dict display keeps `raw` at its original field position.
    """
    return {**asdict(record), "raw": list(record.raw)}
def _raw_record_to_jsonable(record: RawPcdbRecord) -> dict[str, object]:
    """Turn an untyped PCDB record into a `{pcdb_id, raw}` dict.

    `raw` is copied into a fresh list so the result is JSON-serialisable
    regardless of the sequence type the record holds.
    """
    jsonable: dict[str, object] = {"pcdb_id": record.pcdb_id}
    jsonable["raw"] = [*record.raw]
    return jsonable
def _write_ndjson(*, output_path: Path, records: list[dict[str, object]]) -> None:
    """Write `records` to `output_path` as newline-delimited JSON.

    One record per line, no top-level array, no indent, so diffs are
    line-granular when records are added or changed. Non-ASCII characters
    are written as-is (`ensure_ascii=False`) in UTF-8. Any existing file at
    `output_path` is overwritten.

    Every record is terminated by its own newline. An empty `records` list
    therefore produces an empty file rather than a lone blank line, which
    would not be a valid JSON Lines record.
    """
    payload = "".join(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )
    output_path.write_text(payload, encoding="utf-8")
def run_etl(*, source: Path, output_dir: Path) -> None:
    """Convert `source` (pcdb10.dat) into one `.jsonl` file per PCDB table.

    Files are written under `output_dir/` (created if missing) in this
    order:

    * Table 105 via `parse_table_105`, serialised field by field;
    * Tables 322, 323 and 329 via their typed parsers, serialised as the
      identifying key plus the raw row;
    * every table listed in `_RAW_TABLES` via `parse_table_raw`.

    Each table is parsed and written before the next one is parsed, and
    records are written in the order the parser returns them, which keeps
    the output diff-friendly. Idempotent.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # The whole .dat is decoded once as latin-1 and shared by every parser.
    pcdb_text = source.read_text(encoding="latin-1")

    def emit(filename: str, rows: list[dict[str, object]]) -> None:
        """Write one table's serialised rows to `output_dir / filename`."""
        _write_ndjson(output_path=output_dir / filename, records=rows)

    # Table 105 (gas/oil boilers): all typed fields are written out.
    emit(
        _TABLE_105_OUTPUT_FILENAME,
        [_gas_oil_record_to_jsonable(row) for row in parse_table_105(pcdb_text)],
    )

    # Table 322 (Decentralised MEV): parsed with the typed parser, which
    # exposes the per-fan-configuration block used by the SAP 10.2 §2.6.4
    # SFPav cascade; stored on disk as the raw row.
    emit(
        _TABLE_322_OUTPUT_FILENAME,
        [
            _decentralised_mev_record_to_jsonable(row)
            for row in parse_table_322(pcdb_text)
        ],
    )

    # Table 323 (Centralised MEV and MVHR): per-wet-room SFP and
    # heat-recovery-efficiency test points for the SAP 10.2 §2.6.4/§2.6.6
    # MVHR cascade; stored on disk as the raw row, like Table 322.
    emit(
        _TABLE_323_OUTPUT_FILENAME,
        [_mvhr_record_to_jsonable(row) for row in parse_table_323(pcdb_text)],
    )

    # Table 329 (MV In-Use Factors): per-ducting-type SFP in-use-factor
    # multipliers for "no approved scheme" installations; stored on disk
    # as the raw row.
    emit(
        _TABLE_329_OUTPUT_FILENAME,
        [
            _mv_in_use_factors_record_to_jsonable(row)
            for row in parse_table_329(pcdb_text)
        ],
    )

    # Remaining tables have no typed parser yet: pcdb_id + raw row only.
    for table_id, filename in _RAW_TABLES.items():
        emit(
            filename,
            [
                _raw_record_to_jsonable(row)
                for row in parse_table_raw(pcdb_text, table_id)
            ],
        )
def _decentralised_mev_record_to_jsonable(
    record: DecentralisedMevRecord,
) -> dict[str, object]:
    """Reduce a typed Table 322 record to `{pcdb_id, raw}`.

    Only the identifier and the raw row are kept, which gives the same
    on-disk shape as `_raw_record_to_jsonable`. The typed fields are not
    written; per the original note, the lookup re-decodes the raw row
    with `parse_decentralised_mev_row` at import time.
    """
    identifier = record.pcdb_id
    raw_fields = [*record.raw]
    return {"pcdb_id": identifier, "raw": raw_fields}
def _mvhr_record_to_jsonable(record: MvhrRecord) -> dict[str, object]:
    """Reduce a typed Table 323 record to `{pcdb_id, raw}`.

    Same on-disk shape as the other typed tables: the identifier plus a
    list copy of the raw row. Per the original note, the lookup re-decodes
    the raw row with `parse_centralised_mv_row` at import time.
    """
    jsonable: dict[str, object] = {}
    jsonable["pcdb_id"] = record.pcdb_id
    jsonable["raw"] = list(record.raw)
    return jsonable
def _mv_in_use_factors_record_to_jsonable(
    record: MvInUseFactorsRecord,
) -> dict[str, object]:
    """Reduce a typed Table 329 record to `{system_type, raw}`.

    Table 329 rows are identified by `system_type` rather than `pcdb_id`,
    so that is the key written alongside the raw row. Per the original
    note, `mv_in_use_factors(system_type)` callers resolve via that key.
    """
    key = record.system_type
    raw_fields = list(record.raw)
    return {"system_type": key, "raw": raw_fields}
if __name__ == "__main__":  # pragma: no cover — manual ETL invocation
    # The source .dat and the generated .jsonl files share the `data`
    # directory that sits next to this module.
    pcdb_data_dir = Path(__file__).resolve().parent.joinpath("data")
    run_etl(source=pcdb_data_dir.joinpath("pcdb10.dat"), output_dir=pcdb_data_dir)