mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
import sys
|
|
import json
|
|
from pprint import pprint
|
|
|
|
sys.path.insert(0, '/workspaces/survey-extractor')
|
|
|
|
from etl.fileReader.pdfReaderToText import pdfReaderToText
|
|
|
|
# Input PDF for this debug run.
# Pass a path as the first command-line argument to inspect a different file;
# with no argument the script falls back to the default path below, so the
# file no longer has to be edited for every new PDF.
SOURCE_PDF_PATH = (
    sys.argv[1]
    if len(sys.argv) > 1
    else "/workspaces/survey-extractor/RdSAP_SiteNote_95053636_V3_Assessment (1).pdf"
)

# Build the reader for the selected PDF; the rest of the script inspects it.
reader = pdfReaderToText(SOURCE_PDF_PATH)
# Summary of what the reader produced: its detected type and token count.
print(f"Detected type: {reader.type}")
print(f"Total text tokens: {len(reader.text_list)}\n")

# Dump every token with a zero-padded index; the repr form keeps whitespace
# and escape characters visible in the output.
print("--- Raw Token List ---")
for index, token in enumerate(reader.text_list):
    print(f"[{index:04d}] {token!r}")
print("\n\n--- Extracted Structured Data ---")

# The reader hands back an extractor; its master_obj carries one attribute
# per section printed below.
extractor = reader.get_reader()
obj = extractor.master_obj

# (display title, attribute name on the master object), in print order.
_SECTION_ATTRIBUTES = (
    ("Header", "header"),
    ("General", "general"),
    ("Building Construction", "building_construction"),
    ("Roof Space", "roof_space"),
    ("Main Heating", "main_heating"),
    ("Secondary Heating", "secondary_heating"),
    ("Water Heating", "water_heating"),
    ("Ventilation", "ventilation"),
    ("Renewables", "renewables"),
    ("Room Count", "room_count"),
    ("Misc", "misc"),
    ("Customer Response", "customer_response"),
    ("Addendum", "addendum"),
)

# Resolve every section up front, so a missing attribute fails before
# anything is printed.
sections = [
    (heading, getattr(obj, attribute))
    for heading, attribute in _SECTION_ATTRIBUTES
]

# Pretty-print each section under its own banner.
for title, section in sections:
    print(f"\n=== {title} ===")
    pprint(section.model_dump())
# Windows are a collection on the master object: print the count in the
# banner, then pretty-print each entry.
window_count = len(obj.windows)
print(f"\n=== Windows ({window_count} found) ===")
for window in obj.windows:
    pprint(window.model_dump())
# Finally, emit the whole master object as indented JSON.
print("\n\n--- Full JSON ---")
full_dump = obj.model_dump()
# default=str stringifies any value the json module cannot encode natively.
print(json.dumps(full_dump, indent=2, default=str))