chore(debug): summary_to_sap.py — Elmhurst Summary PDF -> our SAP + trail 🟪

Runs a Summary PDF through the chain-test path (pdftotext ->
ElmhurstSiteNotesExtractor -> from_elmhurst_site_notes) into Sap10Calculator
and dumps SAP + per-end-use kWh + the intermediate worksheet trail, for
diffing our calc against the accompanying Elmhurst U985 worksheet PDF.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-28 21:26:30 +00:00
parent 0851b48807
commit 02ef67fd8f

94
scripts/summary_to_sap.py Normal file
View file

@ -0,0 +1,94 @@
"""Elmhurst Summary PDF -> EpcPropertyData -> Sap10Calculator, with a dump of
our SAP score + per-end-use kWh + the `intermediate` worksheet trail, for
diffing against the accompanying Elmhurst worksheet PDF.
Usage:
python -m scripts.summary_to_sap "<path to Summary_*.pdf>"
Reuses the exact preprocessing the Summary->EpcPropertyData chain test uses
(`backend/documents_parser/tests/test_summary_pdf_mapper_chain.py`):
`pdftotext -layout` -> Textract-style label/value stream -> extractor ->
`from_elmhurst_site_notes` mapper.
"""
from __future__ import annotations
import re
import subprocess
import sys
from pathlib import Path
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import Sap10Calculator
from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Render each page of a PDF as a newline-joined stream of text tokens.

    The page count is read from ``pdfinfo``; every page is then extracted on
    its own with ``pdftotext -layout``. Each non-blank layout line is split
    into tokens wherever two or more whitespace characters occur in a row,
    and each blank line contributes one empty token, so blank lines survive
    as empty lines in the page text.

    Both tools are asked for UTF-8 output (``-enc UTF-8``) and their output is
    decoded as UTF-8, so the result does not depend on the process locale.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One string per page, in page order, holding that page's tokens joined
        by newlines.

    Raises:
        RuntimeError: If the ``pdfinfo`` output has no ``Pages:`` line.
        subprocess.CalledProcessError: If ``pdfinfo`` or ``pdftotext`` exits
            with a non-zero status (for example, the file does not exist).
        FileNotFoundError: If either executable cannot be found.
    """
    # Only the page count is needed from pdfinfo, so undecodable metadata
    # bytes are replaced rather than allowed to abort the run.
    info = subprocess.run(
        ["pdfinfo", "-enc", "UTF-8", str(pdf_path)],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        check=True,
    ).stdout
    # Anchor to the start of a line so that "Pages: N" appearing inside
    # another metadata value (e.g. the document title) cannot be picked up.
    page_match = re.search(r"^Pages:\s+(\d+)", info, flags=re.MULTILINE)
    if page_match is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    page_count = int(page_match.group(1))

    # A run of two or more whitespace characters separates layout columns.
    column_gap = re.compile(r"\s{2,}")

    pages: list[str] = []
    for page_number in range(1, page_count + 1):
        layout = subprocess.run(
            [
                "pdftotext", "-layout", "-enc", "UTF-8",
                "-f", str(page_number), "-l", str(page_number),
                str(pdf_path), "-",  # "-" sends the text to stdout
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=True,
        ).stdout
        tokens: list[str] = []
        for line in layout.splitlines():
            stripped = line.strip()
            if not stripped:
                # Keep blank lines as empty tokens.
                tokens.append("")
                continue
            tokens.extend(part for part in column_gap.split(stripped) if part)
        pages.append("\n".join(tokens))
    return pages
def main(pdf: str) -> None:
    """Run one Summary PDF through the calculator and print a comparison dump.

    The PDF is converted to per-page token text, passed through
    ``ElmhurstSiteNotesExtractor`` and
    ``EpcPropertyDataMapper.from_elmhurst_site_notes``, and the mapped data is
    fed to both ``cert_to_inputs`` and ``Sap10Calculator().calculate``. The SAP
    score, per-end-use kWh figures, cost/rating values and heat-balance entries
    from the result's ``intermediate`` mapping are then printed to stdout.

    Args:
        pdf: Path to the Summary PDF, as a string.
    """
    pdf_path = Path(pdf)
    page_texts = _summary_pdf_to_textract_style_pages(pdf_path)
    site_notes = ElmhurstSiteNotesExtractor(page_texts).extract()
    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
    calc_inputs = cert_to_inputs(epc)
    result = Sap10Calculator().calculate(epc)

    # The first building part (when there is one) supplies the age band.
    if epc.sap_building_parts:
        first_part = epc.sap_building_parts[0]
    else:
        first_part = None

    def trail_value(key: str):
        """Fetch ``key`` from the intermediate mapping, or NaN if absent."""
        return result.intermediate.get(key, float("nan"))

    print(f"=== {pdf_path.name} ===")
    dwelling_line = (
        f"dwelling_type={epc.dwelling_type!r} property_type={epc.property_type!r} "
        f"age_band={first_part.construction_age_band if first_part else None} "
        f"TFA={epc.total_floor_area_m2}"
    )
    print(dwelling_line)
    score_line = (
        f"OUR SAP = {result.sap_score} ({result.sap_score_continuous:.4f}) "
        f"CO2={result.co2_kg_per_yr/1000:.3f} t/yr "
        f"PEUI={result.primary_energy_kwh_per_m2:.1f}"
    )
    print(score_line)

    print("--- per end use (kWh/yr) ---")
    # (printed label, attribute on the calculator result)
    end_uses = (
        ("space_heating useful", "space_heating_kwh_per_yr"),
        ("main_heating fuel", "main_heating_fuel_kwh_per_yr"),
        ("secondary fuel", "secondary_heating_fuel_kwh_per_yr"),
        ("hot_water", "hot_water_kwh_per_yr"),
        ("lighting", "lighting_kwh_per_yr"),
        ("pumps_fans", "pumps_fans_kwh_per_yr"),
    )
    for label, attribute in end_uses:
        print(f" {label} = {getattr(result, attribute):.1f}")
    print(f" delivered fuel total = {trail_value('delivered_fuel_kwh_per_yr'):.1f}")

    print("--- costs / rating ---")
    cost_keys = (
        "main_heating_cost_gbp",
        "secondary_heating_cost_gbp",
        "hot_water_cost_gbp",
        "pumps_fans_cost_gbp",
        "lighting_cost_gbp",
        "ecf",
    )
    for key in cost_keys:
        print(f" {key:28s} {trail_value(key):.4f}")
    tariff_line = (
        f" is_off_peak={result.is_off_peak_meter} "
        f"main_hrf={result.main_heating_high_rate_fraction} "
        f"hw_hrf={result.hot_water_high_rate_fraction:.4f} "
        f"other_hrf={result.other_electricity_high_rate_fraction}"
    )
    print(tariff_line)
    price_line = (
        f" space £/kWh={calc_inputs.space_heating_fuel_cost_gbp_per_kwh} "
        f"hw £/kWh={calc_inputs.hot_water_fuel_cost_gbp_per_kwh} "
        f"other £/kWh={calc_inputs.other_fuel_cost_gbp_per_kwh}"
    )
    print(price_line)

    print("--- heat balance (intermediate) ---")
    heat_balance_keys = (
        "heat_transfer_coefficient_w_per_k",
        "heat_loss_parameter_w_per_m2k",
        "walls_w_per_k",
        "roof_w_per_k",
        "floor_w_per_k",
        "party_walls_w_per_k",
        "windows_w_per_k",
        "doors_w_per_k",
        "thermal_bridging_w_per_k",
        "infiltration_w_per_k",
        "infiltration_ach",
        "internal_gains_annual_avg_w",
        "mean_internal_temp_annual_avg_c",
        "useful_space_heating_kwh_per_yr",
    )
    for key in heat_balance_keys:
        print(f" {key:38s} {trail_value(key):.4f}")
if __name__ == "__main__":
    # Exactly one positional argument (the Summary PDF path) is expected;
    # anything else prints the module docstring as usage and exits with 2.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        main(cli_args[0])
    else:
        print(__doc__)
        sys.exit(2)