survey-extraction/smart_epc_explorer.py
Jun-te Kim 50ea324ca5 test
2026-03-30 18:31:05 +00:00

49 lines
1.4 KiB
Python

import sys
import json
from pprint import pprint
sys.path.insert(0, '/workspaces/survey-extractor')
from etl.fileReader.pdfReaderToText import pdfReaderToText
# Path of the survey PDF to explore; edit this to inspect a different file.
SOURCE_PDF_PATH = "/workspaces/survey-extractor/RdSAP_SiteNote_95053636_V3_Assessment (1).pdf"

# (display title, attribute name on the extracted master object), in print order.
_SECTION_FIELDS = (
    ("Header", "header"),
    ("General", "general"),
    ("Building Construction", "building_construction"),
    ("Roof Space", "roof_space"),
    ("Main Heating", "main_heating"),
    ("Secondary Heating", "secondary_heating"),
    ("Water Heating", "water_heating"),
    ("Ventilation", "ventilation"),
    ("Renewables", "renewables"),
    ("Room Count", "room_count"),
    ("Misc", "misc"),
    ("Customer Response", "customer_response"),
    ("Addendum", "addendum"),
)

# --- Stage 1: load the PDF and report what the reader produced ---------------
reader = pdfReaderToText(SOURCE_PDF_PATH)
print(f"Detected type: {reader.type}")
print(f"Total text tokens: {len(reader.text_list)}\n")

# --- Stage 2: dump every raw token with a zero-padded index ------------------
print("--- Raw Token List ---")
for position, token in enumerate(reader.text_list):
    print(f"[{position:04d}] {token!r}")

# --- Stage 3: run the extractor and show each structured section -------------
print("\n\n--- Extracted Structured Data ---")
extractor = reader.get_reader()
obj = extractor.master_obj

# Resolve all section attributes up front, before any section is printed, so a
# missing attribute fails at the same point it would with a literal list.
sections = [(heading, getattr(obj, field)) for heading, field in _SECTION_FIELDS]

for heading, model in sections:
    print(f"\n=== {heading} ===")
    pprint(model.model_dump())

# Windows are a collection rather than a single section, so print them one by one.
print(f"\n=== Windows ({len(obj.windows)} found) ===")
for window in obj.windows:
    pprint(window.model_dump())

# --- Stage 4: the whole extracted object as a single JSON document -----------
print("\n\n--- Full JSON ---")
full_dump = obj.model_dump()
# default=str stringifies any value json cannot serialise natively.
print(json.dumps(full_dump, indent=2, default=str))