# survey-extraction/smart_epc_explorer.py

import sys
import json
from pprint import pprint

sys.path.insert(0, '/workspaces/survey-extractor')

from etl.fileReader.pdfReaderToText import pdfReaderToText

# Default PDF to explore. Pass a different path as the first command-line
# argument to explore another file without editing this script.
SOURCE_PDF_PATH = "/workspaces/survey-extractor/RdSAP_SiteNote_95053636_V3_Assessment (1).pdf"


def main(pdf_path=SOURCE_PDF_PATH):
    """Print the raw tokens and the structured extraction for one PDF.

    Output goes to stdout in four parts: the detected document type and the
    raw token list, each named section of the extracted object, every window
    entry, and finally the whole extracted object as indented JSON.

    Args:
        pdf_path: Path of the PDF handed to ``pdfReaderToText``. Defaults to
            ``SOURCE_PDF_PATH``.
    """
    reader = pdfReaderToText(pdf_path)

    print(f"Detected type: {reader.type}")
    print(f"Total text tokens: {len(reader.text_list)}\n")
    print("--- Raw Token List ---")
    # repr() keeps whitespace and escape characters in each token visible.
    for index, token in enumerate(reader.text_list):
        print(f"[{index:04d}] {repr(token)}")

    print("\n\n--- Extracted Structured Data ---")
    extractor = reader.get_reader()
    obj = extractor.master_obj

    # (heading, section) pairs, printed in this order.
    # NOTE(review): sections are presumably pydantic models, given
    # model_dump() -- confirm against the extractor's definitions.
    sections = [
        ("Header", obj.header),
        ("General", obj.general),
        ("Building Construction", obj.building_construction),
        ("Roof Space", obj.roof_space),
        ("Main Heating", obj.main_heating),
        ("Secondary Heating", obj.secondary_heating),
        ("Water Heating", obj.water_heating),
        ("Ventilation", obj.ventilation),
        ("Renewables", obj.renewables),
        ("Room Count", obj.room_count),
        ("Misc", obj.misc),
        ("Customer Response", obj.customer_response),
        ("Addendum", obj.addendum),
    ]

    for title, section in sections:
        print(f"\n=== {title} ===")
        pprint(section.model_dump())

    # Windows are a collection, so they are dumped one entry at a time.
    print(f"\n=== Windows ({len(obj.windows)} found) ===")
    for window in obj.windows:
        pprint(window.model_dump())

    print("\n\n--- Full JSON ---")
    # default=str stringifies any value json cannot serialize natively, so the
    # dump never fails on such values (at the cost of losing their type).
    print(json.dumps(obj.model_dump(), indent=2, default=str))


if __name__ == "__main__":
    # An optional first CLI argument overrides the default PDF path.
    main(sys.argv[1] if len(sys.argv) > 1 else SOURCE_PDF_PATH)